\documentclass{acm_proc_article-sp} \usepackage[utf8]{inputenc} \usepackage{amsmath,amsfonts} \usepackage{algorithm} \usepackage{algpseudocode} \newtheorem{lemma}{Lemma} \newtheorem{fact}{Fact} \newtheorem{example}{Example} \newtheorem{prop}{Proposition} \newtheorem{theorem}{Theorem} \newcommand*{\defeq}{\stackrel{\text{def}}{=}} \newcommand{\var}{\mathop{\mathrm{Var}}} \newcommand{\condexp}[2]{\mathop{\mathbb{E}}\left[#1|#2\right]} \newcommand{\expt}[1]{\mathop{\mathbb{E}}\left[#1\right]} \newcommand{\norm}[1]{\lVert#1\rVert} \newcommand{\tr}[1]{#1^*} \newcommand{\ip}[2]{\langle #1, #2 \rangle} \newcommand{\mse}{\mathop{\mathrm{MSE}}} \DeclareMathOperator{\trace}{tr} \DeclareMathOperator*{\argmax}{arg\,max} \title{Budgeted mechanism for optimal experiment design} \begin{document} \maketitle \section{Intro} \begin{itemize} \item already existing field of experiment design: survey-like setup, what are the best points to include in your experiment? Measure of the usefulness of the data: variance-reduction or entropy-reduction. \item nowadays, there is also a big focus on purchasing data: paid surveys, mechanical turk, etc. that add economic aspects to the problem of experiment design \item recent advances (Singer, Chen) in the field of budgeted mechanisms \item we study ridge regression, very widely used in statistical learning, and treat it as a problem of budgeted experiment design \item we make the following contributions: ... \item extension to a more general setup which includes a wider class of machine learning problems \end{itemize} \section{Problem formulation} \subsection{Notations} \begin{itemize} \item If $x\in\mathbf{R}^d$, $x^*$ will denote the transpose of vector $x$. If $(x, y)\in\mathbf{R}^d\times\mathbf{R}^d$, $x^*y$ is the standard inner product of $\mathbf{R}^d$. 
\item We will often use the following order over symmetric matrices: if $A$ and $B$ are two $d\times d$ real symmetric matrices, we write that $A\leq B$ iff: \begin{displaymath} \forall x\in\mathbf{R}^d,\quad x^*Ax \leq x^*Bx \end{displaymath} That is, iff $B-A$ is symmetric positive semi-definite. \item Let us recall this property of the order defined above: if $A$ and $B$ are two symmetric positive definite matrices such that $A\leq B$, then $A^{-1}\leq B^{-1}$. \end{itemize} \subsection{Data model} There is a set of $n$ users, $\mathcal{N} = \{1,\ldots, n\}$. Each user $i\in\mathcal{N}$ has a public vector of features $x_i\in\mathbf{R}^d$ and an undisclosed piece of information $y_i\in\mathbf{R}$. We assume that the data has already been normalized so that $\norm{x_i}_2\leq 1$ for all $i\in\mathcal{N}$. The experimenter is going to select a set of users and ask them to reveal their private piece of information. We are interested in a \emph{survey setup}: the experimenter has not seen the data yet, but he wants to know which users he should be selecting. His goal is to learn the model underlying the data. Here, we assume a linear model: \begin{displaymath} \forall i\in\mathcal{N},\quad y_i = \beta^* x_i + \varepsilon_i \end{displaymath} where $\beta\in\mathbf{R}^d$ and $\varepsilon_i\in\mathbf{R}$ follows a normal distribution of mean $0$ and variance $\sigma^2$. After observing the data, the experimenter could simply do linear regression to learn the model parameter $\beta$. However, in a more general setup, the experimenter has prior knowledge about $\beta$, a distribution over $\mathbf{R}^d$. After observing the data, the experimenter performs \emph{maximum a posteriori estimation}: computing the point which maximizes the posterior distribution of $\beta$ given the observations. Here, we will assume, as it is often done, that the prior distribution is a multivariate normal distribution of mean zero and covariance matrix $\kappa I_d$. 
Maximum a posteriori estimation leads to the following minimization problem: \begin{displaymath} \beta_{\text{MAP}} = \operatorname*{arg\,min}_{\beta\in\mathbf{R}^d} \sum_i (y_i - \beta^*x_i)^2 + \frac{1}{\mu}\norm{\beta}_2^2 \end{displaymath} which is the well-known \emph{ridge regression}. $\mu = \frac{\kappa}{\sigma^2}$ is the regularization parameter. Ridge regression can thus be seen as linear regression with a regularization term which prevents $\beta$ from having a large $L_2$-norm. \begin{itemize} \item set of $n$ users $\mathcal{N} = \{1,\ldots, n\}$ \item each user has a public vector of features $x_i\in\mathbf{R}^d$ and some undisclosed variable $y_i\in\mathbf{R}$ \item we assume that the data has been normalized: $\forall i\in\mathcal{N}$, $\norm{x_i}\leq 1$ \item \textbf{Ridge regression model:} \begin{itemize} \item $y_i = \beta^*x_i + \varepsilon_i$ \item $\varepsilon_i \sim \mathcal{N}(0,\sigma^2)$, $(\varepsilon_i)_{i\in \mathcal{N}}$ are mutually independent. \item prior knowledge of $\beta$: $\beta\sim\mathcal{N}(0,\kappa I_d)$ \item $\mu = \frac{\kappa}{\sigma^2}$ is the regularization factor. \item after the variables $y_i$ are disclosed, the experimenter does \emph{maximum a posteriori estimation} which is equivalent under Gaussian prior to solving the ridge regression minimization problem: \begin{displaymath} \beta_{\text{MAP}} = \operatorname*{arg\,min}_{\beta\in\mathbf{R}^d} \sum_i (y_i - \beta^*x_i)^2 + \frac{1}{\mu}\norm{\beta}_2^2 \end{displaymath} \end{itemize} \end{itemize} \subsection{Economics} Value function: following experiment design theory, the value of data is the decrease of uncertainty about the model. A common measure of uncertainty is the entropy. Thus, the value of a set $S$ of users is: \begin{displaymath} V(S) = \mathbb{H}(\beta) - \mathbb{H}(\beta|Y_S) \end{displaymath} where $Y_S = \{y_i,\,i\in S\}$ is the set of observations. 
In our case (under Gaussian prior): \begin{align*} \forall S\subset\mathcal{N},\; V(S) & = \frac{1}{2}\log\det\left(I_d + \mu\sum_{i\in S} x_ix_i^*\right)\\ & \defeq \frac{1}{2}\log\det A(S) \end{align*} \begin{lemma}[Marginal contribution] \begin{displaymath} \Delta_i V(S)\defeq V(S\cup\{i\}) - V(S) = \frac{1}{2}\log\left(1 + \mu x_i^*A(S)^{-1}x_i\right) \end{displaymath} \end{lemma} \begin{proof} We have: \begin{align*} V(S\cup\{i\}) & = \frac{1}{2}\log\det A(S\cup\{i\})\\ & = \frac{1}{2}\log\det\left(A(S) + \mu x_i x_i^*\right)\\ & = V(S) + \frac{1}{2}\log\det\left(I_d + \mu A(S)^{-1}x_i x_i^*\right)\\ & = V(S) + \frac{1}{2}\log\left(1 + \mu x_i^* A(S)^{-1}x_i\right) \end{align*} where the last equality comes from Sylvester's determinant formula. \end{proof} \begin{itemize} \item each user $i$ has a cost $c_i$ \item the auctioneer has a budget constraint $B$ \item optimisation problem: \begin{displaymath} OPT(V,\mathcal{N}, B) = \max_{S\subset\mathcal{N}} \left\{ V(S)\,|\, \sum_{i\in S}c_i\leq B\right\} \end{displaymath} \end{itemize} The problem is that the costs $c_i$ are reported by the users themselves, who may misreport them to their advantage: this is an auction problem. 
Goal: design a mechanism (selection and payment scheme) which is: \begin{itemize} \item truthful \item individually rational \item budget feasible \item has a good approximation ratio \end{itemize} \section{Main result} \begin{algorithm} \caption{Mechanism for ridge regression}\label{mechanism} \begin{algorithmic}[1] \State $i^* \gets \argmax_{j\in\mathcal{N}}V(j)$ \State $x^* \gets \argmax_{x\in[0,1]^n} \{L_{\mathcal{N}\setminus\{i^*\}}(x) \,|\, c(x)\leq B\}$ \Statex \If{$L(x^*) < CV(i^*)$} \State \textbf{return} $\{i^*\}$ \Else \State $i \gets \argmax_{1\leq j\leq n}\frac{V(j)}{c_j}$ \State $S \gets \emptyset$ \While{$c_i\leq \frac{B}{2}\frac{V(S\cup\{i\})-V(S)}{V(S\cup\{i\})}$} \State $S \gets S\cup\{i\}$ \State $i \gets \argmax_{j\in\mathcal{N}\setminus S} \frac{V(S\cup\{j\})-V(S)}{c_j}$ \EndWhile \State \textbf{return} $S$ \EndIf \end{algorithmic} \end{algorithm} Choosing $C=...$ we have: \begin{theorem} The mechanism is truthful, individually rational, budget feasible. Furthermore, by choosing $C= ...$ we get an approximation ratio of: \begin{multline*} 1 + C^* = \frac{5e-1 + C_\mu(4e-1)}{2C_\mu(e-1)}\\ + \frac{\sqrt{C_\mu^2(1+2e)^2 + 2C_\mu(14e^2+5e+1) + (1-5e)^2}}{2C_\mu(e-1)} \end{multline*} \end{theorem} \section{General setup} \section{Conclusion} \end{document}