\documentclass{acm_proc_article-sp} \usepackage[utf8]{inputenc} \usepackage{amsmath,amsfonts} \usepackage{algorithm} \usepackage{algpseudocode} \newtheorem{lemma}{Lemma} \newtheorem{fact}{Fact} \newtheorem{example}{Example} \newtheorem{prop}{Proposition} \newtheorem{theorem}{Theorem} \newcommand*{\defeq}{\stackrel{\text{def}}{=}} \newcommand{\var}{\mathop{\mathrm{Var}}} \newcommand{\condexp}[2]{\mathop{\mathbb{E}}\left[#1|#2\right]} \newcommand{\expt}[1]{\mathop{\mathbb{E}}\left[#1\right]} \newcommand{\norm}[1]{\lVert#1\rVert} \newcommand{\tr}[1]{#1^*} \newcommand{\ip}[2]{\langle #1, #2 \rangle} \newcommand{\mse}{\mathop{\mathrm{MSE}}} \DeclareMathOperator{\trace}{tr} \DeclareMathOperator*{\argmax}{arg\,max} \title{Budgeted mechanism for optimal experiment design} \begin{document} \maketitle \section{Intro} \begin{itemize} \item already existing field of experiment design: survey-like setup, what are the best points to include in your experiment? Measure of the usefulness of the data: variance-reduction or entropy-reduction. \item nowadays, there is also a big focus on purchasing data: paid surveys, mechanical turk, etc. that add economic aspects to the problem of experiment design \item recent advances (Singer, Chen) in the field of budgeted mechanisms \item we study ridge regression, very widely used in statistical learning, and treat it as a problem of budgeted experiment design \item we make the following contributions: ... \item extension to a more general setup which includes a wider class of machine learning problems \end{itemize} \section{Problem formulation} \subsection{Notations} \begin{itemize} \item If $x\in\mathbf{R}^d$, $x^*$ will denote the transpose of vector $x$. If $(x, y)\in\mathbf{R}^d\times\mathbf{R}^d$, $x^*y$ is the standard inner product of $\mathbf{R}^d$. 
\item We will often use the following order over symmetric matrices: if $A$ and $B$ are two $d\times d$ real symmetric matrices, we write that $A\leq B$ iff: \begin{displaymath} \forall x\in\mathbf{R}^d,\quad x^*Ax \leq x^*Bx \end{displaymath} That is, iff $B-A$ is symmetric positive semi-definite. \item Let us recall this property of the order defined above: if $A$ and $B$ are two symmetric positive definite matrices such that $A\leq B$, then $A^{-1}\leq B^{-1}$. \end{itemize} \subsection{Data model} There is a set of $n$ users, $\mathcal{N} = \{1,\ldots, n\}$. Each user $i\in\mathcal{N}$ has a public vector of features $x_i\in\mathbf{R}^d$ and an undisclosed piece of information $y_i\in\mathbf{R}$. We assume that the data has already been normalized so that $\norm{x_i}_2\leq 1$ for all $i\in\mathcal{N}$. The experimenter is going to select a set of users and ask them to reveal their private piece of information. We are interested in a \emph{survey setup}: the experimenter has not seen the data yet, but he wants to know which users he should be selecting. His goal is to learn the model underlying the data. Here, we assume a linear model: \begin{displaymath} \forall i\in\mathcal{N},\quad y_i = \beta^* x_i + \varepsilon_i \end{displaymath} where $\beta\in\mathbf{R}^d$ and $\varepsilon_i\in\mathbf{R}$ follows a normal distribution of mean $0$ and variance $\sigma^2$. After observing the data, the experimenter could simply do linear regression to learn the model parameter $\beta$. However, in a more general setup, the experimenter has prior knowledge about $\beta$, a distribution over $\mathbf{R}^d$. After observing the data, the experimenter performs \emph{maximum a posteriori estimation}: computing the point which maximizes the posterior distribution of $\beta$ given the observations. Here, we will assume, as it is often done, that the prior distribution is a multivariate normal distribution of mean zero and covariance matrix $\kappa I_d$. 
Maximum a posteriori estimation leads to the following minimization problem: \begin{displaymath} \beta_{\text{MAP}} = \operatorname*{arg\,min}_{\beta\in\mathbf{R}^d} \sum_i (y_i - \beta^*x_i)^2 + \frac{1}{\mu}\norm{\beta}_2^2 \end{displaymath} which is the well-known \emph{ridge regression}. $\mu = \frac{\kappa}{\sigma^2}$ is the regularization parameter. Ridge regression can thus be seen as linear regression with a regularization term which prevents $\beta$ from having a large $L_2$-norm. \begin{itemize} \item set of $n$ users $\mathcal{N} = \{1,\ldots, n\}$ \item each user has a public vector of features $x_i\in\mathbf{R}^d$ and some undisclosed variable $y_i\in\mathbf{R}$ \item we assume that the data has been normalized: $\forall i\in\mathcal{N}$, $\norm{x_i}\leq 1$ \item \textbf{Ridge regression model:} \begin{itemize} \item $y_i = \beta^*x_i + \varepsilon_i$ \item $\varepsilon_i \sim \mathcal{N}(0,\sigma^2)$, $(\varepsilon_i)_{i\in \mathcal{N}}$ are mutually independent. \item prior knowledge of $\beta$: $\beta\sim\mathcal{N}(0,\kappa I_d)$ \item $\mu = \frac{\kappa}{\sigma^2}$ is the regularization factor. \item after the variables $y_i$ are disclosed, the experimenter does \emph{maximum a posteriori estimation} which is equivalent under Gaussian prior to solving the ridge regression minimization problem: \begin{displaymath} \beta_{\text{MAP}} = \operatorname*{arg\,min}_{\beta\in\mathbf{R}^d} \sum_i (y_i - \beta^*x_i)^2 + \frac{1}{\mu}\norm{\beta}_2^2 \end{displaymath} \end{itemize} \end{itemize} \subsection{Economics} Value function: following experiment design theory, the value of data is the decrease of uncertainty about the model. A common measure of uncertainty is the entropy. Thus, the value of a set $S$ of users is: \begin{displaymath} V(S) = \mathbb{H}(\beta) - \mathbb{H}(\beta|Y_S) \end{displaymath} where $Y_S = \{y_i,\,i\in S\}$ is the set of observations. 
In our case (under Gaussian prior): \begin{align*} \forall S\subset\mathcal{N},\; V(S) & = \frac{1}{2}\log\det\left(I_d + \mu\sum_{i\in S} x_ix_i^*\right)\\ & \defeq \frac{1}{2}\log\det A(S) \end{align*} \begin{lemma}[Marginal contribution] \begin{displaymath} \Delta_i V(S)\defeq V(S\cup\{i\}) - V(S) = \frac{1}{2}\log\left(1 + \mu x_i^*A(S)^{-1}x_i\right) \end{displaymath} \end{lemma} \begin{proof} We have: \begin{align*} V(S\cup\{i\}) & = \frac{1}{2}\log\det A(S\cup\{i\})\\ & = \frac{1}{2}\log\det\left(A(S) + \mu x_i x_i^*\right)\\ & = V(S) + \frac{1}{2}\log\det\left(I_d + \mu A(S)^{-1}x_i x_i^*\right)\\ & = V(S) + \frac{1}{2}\log\left(1 + \mu x_i^* A(S)^{-1}x_i\right) \end{align*} where the last equality comes from Sylvester's determinant formula. \end{proof} \begin{itemize} \item each user $i$ has a cost $c_i$ \item the auctioneer has a budget constraint $B$ \item optimisation problem: \begin{displaymath} OPT(V,\mathcal{N}, B) = \max_{S\subset\mathcal{N}} \left\{ V(S)\,|\, \sum_{i\in S}c_i\leq B\right\} \end{displaymath} \end{itemize} The problem is that the costs $c_i$ are reported by the users themselves, who may misreport them to their advantage: this is an auction problem. 
Goal: design a mechanism (selection and payment scheme) which is: \begin{itemize} \item truthful \item individually rational \item budget feasible \item has a good approximation ratio \end{itemize} \section{Main result} \begin{algorithm} \caption{Mechanism for ridge regression}\label{mechanism} \begin{algorithmic}[1] \State $i^* \gets \argmax_{j\in\mathcal{N}}V(j)$ \State $x^* \gets \argmax_{x\in[0,1]^n} \{L_{\mathcal{N}\setminus\{i^*\}}(x) \,|\, c(x)\leq B\}$ \Statex \If{$L(x^*) < CV(i^*)$} \State \textbf{return} $\{i^*\}$ \Else \State $i \gets \argmax_{1\leq j\leq n}\frac{V(j)}{c_j}$ \State $S \gets \emptyset$ \While{$c_i\leq \frac{B}{2}\frac{V(S\cup\{i\})-V(S)}{V(S\cup\{i\})}$} \State $S \gets S\cup\{i\}$ \State $i \gets \argmax_{j\in\mathcal{N}\setminus S} \frac{V(S\cup\{j\})-V(S)}{c_j}$ \EndWhile \State \textbf{return} $S$ \EndIf \end{algorithmic} \end{algorithm} Choosing $C=...$ we have: \begin{theorem} The mechanism is truthful, individually rational, budget feasible. Furthermore, by choosing $C= ...$ we get an approximation ratio of: \begin{multline*} 1 + C^* = \frac{5e-1 + C_\mu(4e-1)}{2C_\mu(e-1)}\\ + \frac{\sqrt{C_\mu^2(1+2e)^2 + 2C_\mu(14e^2+5e+1) + (1-5e)^2}}{2C_\mu(e-1)} \end{multline*} \end{theorem} \section{General setup} \section{Conclusion} \end{document}