From f28514e9eb3ed56091b3d10b8f6efc3b91a0e2b6 Mon Sep 17 00:00:00 2001
From: Stratis Ioannidis
Date: Wed, 3 Jul 2013 07:04:36 -0700
Subject: up to myerson

---
 problem.tex | 72 ++++++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 48 insertions(+), 24 deletions(-)

diff --git a/problem.tex b/problem.tex
index a5db2ac..c05e20c 100644
--- a/problem.tex
+++ b/problem.tex
@@ -3,7 +3,7 @@ The theory of experimental design \cite{pukelsheim2006optimal,atkinson2007optimum,chaloner1995bayesian} considers the following formal setting. % studies how an experimenter \E\ should select the parameters of a set of experiments she is about to conduct. In general, the optimality of a particular design depends on the purpose of the experiment, \emph{i.e.}, the quantity \E\ is trying to learn or the hypothesis she is trying to validate. Due to their ubiquity in statistical analysis, a large literature on the subject focuses on learning \emph{linear models}, where \E\ wishes to fit a linear function to the data she has collected. %Putting cost considerations aside -Suppose that an experimenter \E\ wishes to conduct $k$ among $n$ possible experiments. Each experiment $i\in\mathcal{N}\defeq \{1,\ldots,n\}$ is associated with a set of parameters (or features) $x_i\in \reals^d$, normalized so that $\|x_i\|_2\leq 1$. Denote by $S\subseteq \mathcal{N}$, where $|S|=k$, the set of experiments selected; upon its execution, experiment $i\in S$ reveals an output variable (the ``measurement'') $y_i$, related to the experiment features $x_i$ through a linear function, \emph{i.e.}, +Suppose that an experimenter \E\ wishes to conduct $k$ among $n$ possible experiments. Each experiment $i\in\mathcal{N}\defeq \{1,\ldots,n\}$ is associated with a set of parameters (or features) $x_i\in \reals^d$, normalized so that $b\leq \|x_i\|_2\leq 1$ for some $b>0$. Denote by $S\subseteq \mathcal{N}$, where $|S|=k$, the set of experiments selected; upon its execution, experiment $i\in S$ reveals an output variable (the ``measurement'') $y_i$, related to the experiment features $x_i$ through a linear function, \emph{i.e.}, \begin{align}\label{model} \forall i\in\mathcal{N},\quad y_i = \T{\beta} x_i + \varepsilon_i \end{align} @@ -19,7 +19,7 @@ etc.). The magnitude of the coefficient $\beta_i$ captures the effect that featu The purpose of these experiments is to allow \E\ to estimate the model $\beta$. In particular, assume that the experimenter \E\ has a {\em prior} distribution on $\beta$, \emph{i.e.}, $\beta$ has a multivariate normal prior -with zero mean and covariance $\sigma^2R^{-1}\in \reals^{d^2}$ (where $\sigma^2$ is the noise variance). +with zero mean and covariance $\sigma^2R^{-1}\in \reals^{d\times d}$ (where $\sigma^2$ is the noise variance, which also scales the prior on $\beta$). Then, \E\ estimates $\beta$ through \emph{maximum a posteriori estimation}: \emph{i.e.}, finding the parameter which maximizes the posterior distribution of $\beta$ given the observations $y_S$.
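The generative setup described above (features with $b\leq\|x_i\|_2\leq 1$, a zero-mean Gaussian prior on $\beta$ with covariance $\sigma^2R^{-1}$, and measurements $y_i=\T{\beta}x_i+\varepsilon_i$) can be summarized in a minimal NumPy sketch. This is an illustration only, not code from the paper; the concrete choices of n, d, sigma, b, and R = I_d are assumptions made for the example.

```python
import numpy as np

# Minimal sketch of the Bayesian linear-model setup; all constants are illustrative.
rng = np.random.default_rng(0)
n, d, sigma = 50, 5, 0.1

# Features x_i in R^d, scaled so that b <= ||x_i||_2 <= 1 (here b = 0.5).
X = rng.normal(size=(n, d))
X /= np.linalg.norm(X, axis=1, keepdims=True)   # unit norm rows
X *= rng.uniform(0.5, 1.0, size=(n, 1))         # shrink into [b, 1]

# Prior: beta ~ N(0, sigma^2 R^{-1}); R = I_d for simplicity.
R = np.eye(d)
beta = rng.multivariate_normal(np.zeros(d), sigma**2 * np.linalg.inv(R))

# Measurements y_i = beta^T x_i + eps_i with i.i.d. Gaussian noise of variance sigma^2.
y = X @ beta + rng.normal(scale=sigma, size=n)
```

With such synthetic data in hand, the MAP estimate and the value function discussed next can be computed directly.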
Under the linearity assumption \eqref{model} and the Gaussian prior on $\beta$, maximum a posteriori estimation leads to the following maximization \cite{hastie}: \begin{align} \hat{\beta} = \argmax_{\beta\in\reals^d} \prob(\beta\mid y_S) =\argmin_{\beta\in\reals^d} \big(\sum_{i\in S} (y_i - \T{\beta}x_i)^2 @@ -33,17 +33,12 @@ This optimization, commonly known as \emph{ridge regression}, includes an additi %\begin{align} \hat{\beta} &=\max_{\beta\in\reals^d}\prob(y_S;\beta) =\argmin_{\beta\in\reals^d } \sum_{i\in S}(\T{\beta}x_i-y_i)^2 \nonumber\\ %& = (\T{X_S}X_S)^{-1}X_S^Ty_S\label{leastsquares}\end{align} %The estimator $\hat{\beta}$ is unbiased, \emph{i.e.}, $\expt{\hat{\beta}} = \beta$ (where the expectation is over the noise variables $\varepsilon_i$). Furthermore, $\hat{\beta}$ is a multidimensional normal random variable with mean $\beta$ and covariance matrix $(X_S\T{X_S})^{-1}$. -Note that the estimator $\hat{\beta}$ is a linear map of $y_S$; as $y_S$ is -a multidimensional normal r.v., so is $\hat{\beta}$ (the randomness coming from -the noise terms $\varepsilon_i$). -%In particular, $\hat{\beta}$ has %mean $\beta$% (\emph{i.e.}, it is an \emph{unbiased estimator}) and -% covariance -%$(R+\T{X_S}X_S)^{-1}$. +%Note that the estimator $\hat{\beta}$ is a linear map of $y_S$; as $y_S$ is a multidimensional normal r.v., so is $\hat{\beta}$ (the randomness coming from the noise terms $\varepsilon_i$). +%In particular, $\hat{\beta}$ has %mean $\beta$% (\emph{i.e.}, it is an \emph{unbiased estimator}) and covariance $(R+\T{X_S}X_S)^{-1}$. Let $V:2^\mathcal{N}\to\reals$ be a \emph{value function}, quantifying how informative a set of experiments $S$ is in estimating $\beta$. The classical experimental design problem amounts to finding a set $S$ that maximizes $V(S)$ subject to the constraint $|S|\leq k$. -A variety of different value functions are used in literature~\cite{pukelsheim2006optimal}. -%; almost all make use of the covariance $(R+\T{X_S}X_S)^{-1}$ of the estimator $\hat{\beta}$. -A value function that has natural advantages is the \emph{information gain}: %\emph{$D$-optimality criterion}: %which yields the following optimization problem +A variety of different value functions are used in the literature~\cite{pukelsheim2006optimal,boyd2004convex}; %; almost all make use of the covariance $(R+\T{X_S}X_S)^{-1}$ of the estimator $\hat{\beta}$. +one that has natural advantages is the \emph{information gain}: %\emph{$D$-optimality criterion}: %which yields the following optimization problem \begin{align} V(S)= I(\beta;y_S) = \entropy(\beta)-\entropy(\beta\mid y_S). \label{informationgain} \end{align} @@ -57,7 +52,13 @@ Under the linear model \eqref{model}, and the Gaussian prior, the information ga \end{align} This value function is known in the experimental design literature as the $D$-optimality criterion -\cite{pukelsheim2006optimal,atkinson2007optimum,chaloner1995bayesian}. +\cite{pukelsheim2006optimal,atkinson2007optimum,chaloner1995bayesian}. + +Note that the estimator $\hat{\beta}$ is a linear map of $y_S$; as $y_S$ is a multidimensional normal r.v., so is $\hat{\beta}$ (the randomness coming from the noise terms $\varepsilon_i$). +In particular, $\hat{\beta}$ has %mean $\beta$% (\emph{i.e.}, it is an \emph{unbiased estimator}) and +covariance $(R+\T{X_S}X_S)^{-1}$. As such, maximizing $V(S)$ can alternatively be seen as a means of reducing the uncertainty on the estimator $\hat{\beta}$ by minimizing the product of the eigenvalues of its covariance.
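The two quantities introduced above, the ridge/MAP estimate with posterior covariance $(R+\T{X_S}X_S)^{-1}$ and the information-gain value $V(S)$, admit short closed-form computations. The sketch below is a hedged illustration rather than the paper's code: it uses the standard linear-Gaussian identity $I(\beta;y_S)=\tfrac{1}{2}\big(\log\det(R+\T{X_S}X_S)-\log\det R\big)$, which matches the log-determinant ($D$-optimality) criterion up to additive constants; the exact normalization in the patch's (elided) equation may differ.

```python
import numpy as np

def map_estimate(X_S, y_S, R):
    # Ridge / MAP estimate: beta_hat = (R + X_S^T X_S)^{-1} X_S^T y_S,
    # whose covariance is (R + X_S^T X_S)^{-1} as noted in the text above.
    return np.linalg.solve(R + X_S.T @ X_S, X_S.T @ y_S)

def information_gain(X_S, R):
    # Information gain / D-optimality value, up to additive constants:
    # 0.5 * (logdet(R + X_S^T X_S) - logdet(R)).
    _, ld_post = np.linalg.slogdet(R + X_S.T @ X_S)
    _, ld_prior = np.linalg.slogdet(R)
    return 0.5 * (ld_post - ld_prior)
```

For instance, with the synthetic X, y, R from the earlier sketch and a candidate set S of row indices, information_gain(X[S], R) scores the design and map_estimate(X[S], y[S], R) returns the corresponding estimate of beta.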
+ +%An alternative interpretation, given that $(R+ \T{X_S}X_S)^{-1}$ is the covariance of the estimator $\hat{\beta}$, is that it tries to minimize the %which is indeed a function of the covariance matrix $(R+\T{X_S}X_S)^{-1}$. %defined as $-\infty$ when $\mathrm{rank}(\T{X_S}X_S)\delta.$ The mechanism is \emph{truthful} if $\delta=0$. \item \emph{Budget Feasibility.} The sum of the payments should not exceed the budget constraint, \emph{i.e.}, $\sum_{i\in\mathcal{N}} p_i(c) \leq B.\label{budget}$ @@ -186,7 +210,7 @@ Given that the full information problem \EDP{} is NP-hard, we further seek mecha As noted in \cite{singer-mechanisms, chen}, budget feasible reverse auctions are \emph{single parameter} auctions: each agent has only one private value (namely, $c_i$). As such, Myerson's Theorem \cite{myerson} gives -a characterization of truthful mechanisms. +a characterization of truthful mechanisms. % NEEDS TO BE FIXED \begin{lemma}[\citeN{myerson}]\label{thm:myerson} \sloppy A normalized mechanism $\mathcal{M} = (S,p)$ for a single parameter auction is
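Although the hunk above cuts off before the lemma statement, Myerson's characterization for single-parameter reverse auctions is commonly operationalized as follows: if the allocation rule is monotone (a selected agent remains selected when she reports a lower cost), truthful payments are threshold payments, i.e., each winner is paid the largest cost she could have reported while still being selected. The sketch below illustrates this with a generic bisection; allocation_rule, cost_upper_bound, and the other names are hypothetical and not taken from the paper.

```python
import numpy as np

def myerson_payment(i, costs, allocation_rule, cost_upper_bound, tol=1e-6):
    """Threshold payment for a winning agent i under a *monotone* allocation rule.

    allocation_rule(bids) -> collection of selected agents (hypothetical interface).
    Returns (approximately) the largest cost agent i could report and still be
    selected, keeping the other agents' reports fixed.
    """
    bids = np.asarray(costs, dtype=float)
    assert i in allocation_rule(bids), "agent i must win at her true cost"
    lo, hi = bids[i], float(cost_upper_bound)
    while hi - lo > tol:
        mid = 0.5 * (lo + hi)
        trial = bids.copy()
        trial[i] = mid
        if i in allocation_rule(trial):
            lo = mid   # i still wins at the higher report
        else:
            hi = mid   # i loses; the threshold lies below mid
    return lo
```

Bisection is used here only for illustration; for specific allocation rules (e.g., greedy selection under a budget) the threshold typically has a closed form.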