1 files changed, 194 insertions, 154 deletions
diff --git a/slides/slides.tex b/slides/slides.tex
index ec2e5be..11ec280 100644
--- a/slides/slides.tex
+++ b/slides/slides.tex
@@ -2,18 +2,14 @@
 \usepackage[utf8x]{inputenc}
 \usepackage[greek,english]{babel}
 \usepackage[LGR,T1]{fontenc}
-\usepackage{lmodern}
 \usepackage{amsmath}
-\usepackage{graphicx}
-\usepackage{tikz}
-\newcommand{\tr}[1]{#1^*}
-\newcommand{\expt}[1]{\mathop{\mathbb{E}}\left[#1\right]}
-\newcommand{\mse}{\mathop{\mathrm{MSE}}}
+\usepackage{algpseudocode,algorithm}
+\DeclareMathOperator*{\argmax}{arg\,max}
 \usetheme{Boadilla}
 \usecolortheme{beaver}
-\title[Data monetization]{Data monetization: pricing user data\\with the Shapley Value}
-\author{Thibaut \textsc{Horel}}
-\institute[Technicolor]{Work with {\greektext Στρατής \textsc{Ιωαννίδης}}}
+\title[Budget Feasible Mechanisms for Experimental Design]{Budget Feasible Mechanisms for Experimental Design}
+\author{xx\\ Joint work with yy and zz}
+\institute[]{}
 \setbeamercovered{transparent}
 
 \AtBeginSection[]
@@ -30,192 +26,236 @@
 \maketitle
 \end{frame}
 
-\section{Problem}
-\begin{frame}{Problem Overview}
-\begin{center}
-\includegraphics[width=0.8\textwidth]{schema.pdf}
-\end{center}
+\begin{frame}
+\tableofcontents
 \end{frame}
 
-\begin{frame}{Motivation}
-
-Why is it useful?
-
-\begin{itemize}
-\item sell/buy user data
-\item compensate users in your database
-\end{itemize}
-
-\end{frame}
+\section{Experimental Design}
+\begin{frame}{Experimental Design}
+    An experimenter is about to conduct an experiment with users. 
+    \begin{itemize}
+        \item $n$ users
+        \item $x_i\in\mathbb{R}^d$: public features of user $i$
+        \item $c_i\in\mathbb{R}^+$: cost of user $i$ for taking part in the experiment
+    \end{itemize}
 
-\begin{frame}{Challenges}
+    When paid $c_i$, user $i$ reveals $y_i$:
+    \begin{displaymath}
+        y_i = \beta^T x_i + \varepsilon_i,\quad\beta\in\mathbb{R}^d,\;
+        \varepsilon_i\sim\mathcal{N}(0,\sigma^2)
+    \end{displaymath}
+    $\beta$ is \alert{unknown} to the experimenter.
 
-We need to understand:
-\begin{itemize}
-\item the recommender system
-\item the revenue structure
-\item the relation between the revenue and the value of the database
-\item how to compensate individual users
-\end{itemize}
+    \begin{block}{}
+    \alert{Question:} budget constraint $B$, which users to pick to learn $\beta$ as
+    well as possible without spending more than $B$?
+    \end{block}
 \end{frame}
 
-\section{Shapley value}
+\begin{frame}{Evaluation criterion}
+    \begin{block}{}
+        \alert{Question:} How to measure the helpfulness of a set of users $S$ in learning $\beta$?
+    \end{block}
+    
+    \vspace{0.5cm}
 
-\begin{frame}{Individual value}
-  \begin{itemize}
-  \item $S$: set of players
-  \item $S'\subset S$: coalition
-  \item $V: \mathcal{P}(S)\to \mathbf{R}$: \alert{value} function:
+    Bayesian approach:
     \begin{itemize}
-    \item $V(\emptyset) = 0$
-    \item $V(S')$: value of the coalition $S'$
+        \item \emph{a priori} knowledge: prior distribution over $\beta$
+        \item the entropy of $\beta$, $H(\beta)$ measures the uncertainty of the experimenter
+        \item after observing $Y_S = \{y_i,\; i\in S\}$, uncertainty becomes $H(\beta\mid Y_S)$
     \end{itemize}
-  \end{itemize}
 
-  \alert{Question:} How to split the value of a subset across all its constituents?
+    \vspace{0.5cm}
 
-  \alert{Goal:} $\phi_i(S',V)$: value of player $i$ in $S'$
+    The value of $S$ is the \alert{entropy reduction}:
+    \begin{displaymath}
+        V(S) = H(\beta) - H(\beta\mid Y_S)
+    \end{displaymath}
 \end{frame}
 
-\begin{frame}{Solution attempts}
-  \begin{itemize}
-    \visible<1->{\item $\phi_i(S',V) = V(\{i\})$?} \visible<2->{``The whole is greater than the sum of its parts''
-      $$V(S')\neq \sum_{i\in S'} V(\{i\})$$}
-    \visible<3->{\item $\phi_i(S',V) = V(S') - V(S'\setminus\{i\})$: the \alert{marginal contribution}?} \visible<4->{depends on the time the player joins the coalition}
-  \end{itemize}
+\begin{frame}{Value function}
+    Gaussian linear model:
+    \begin{displaymath}
+        y_i = \beta^T x_i + \varepsilon_i,\quad\beta\in\mathbb{R}^d,\;
+        \varepsilon_i\sim\mathcal{N}(0,\sigma^2)
+    \end{displaymath}
+    
+    Typical prior distribution $\beta\sim\mathcal{N}(0,\tau I_d)$
+   
+    \vspace{0.5cm}
+
+    \begin{block}{}
+        Under the Gaussian linear model and Gaussian prior distribution:
+        \begin{displaymath}
+            V(S) = \log\det\Big(I_d + \lambda\sum_{i\in S}x_i x_i^T \Big)
+        \end{displaymath}
+    \end{block}
+
+    \vspace{0.5cm}
+
+    \alert{Remark:} $V$ can be computed efficiently using a Cholesky decomposition
 \end{frame}
 
-\begin{frame}{Axioms}
-  \begin{description}
-    \item[Efficiency:] No money lost in the splitting process
-      $$V(S) = \sum_{i\in S} \phi_i(S,V)$$
-\pause
-    \item[Symmetry:] Equal pay for equal contribution
+\section{Non-strategic case}
+\begin{frame}{Optimization Problem}
+    \begin{block}{P1}
+        \begin{align*}
+            &\text{maximize}\; V(S) = \log\det\Big(I_d + \lambda\sum_{i\in S}x_i x_i^T \Big)\\
+            &\text{subject to}\; \sum_{i\in S}c_i\leq B\quad\text{(Knapsack constraint)}
+        \end{align*}
+    \end{block}
+
+    \onslide<2->
+    Good news:
+
+    $V$ is submodular (diminishing return):
+    \begin{displaymath}
+        \forall S\subseteq T\, \forall i,\; V(S\cup\{i\}) - V(S)\geq
+        V(T\cup\{i\}) - V(T)
+    \end{displaymath}
 
-      \alert{if} $\forall S'\subset S\setminus\{i,j\}, V(S'\cup\{i\}) = V(S'\cup\{j\})$
+    \onslide<3->
+    Bad news:
 
-      \alert{then} $\phi_i(S,V) = \phi_j(S,V)$
-\pause
-    \item[Fairness:] $j$'s contribution to $i$ equals $i$'s contribution to $j$
-      $$\phi_i(S,V) - \phi_i(S\setminus\{j\}) = \phi_j(S,V)- \phi_j(S\setminus\{i\})$$
-  \end{description}
+    Maximizing a submodular function under a knapsack constraint is NP-hard (even
+    in the specific case of $\log\det$)
 \end{frame}
 
-\begin{frame}{Solution: the Shapley Value}
+\begin{frame}{Approximation}
+    \begin{definition}[Approximation ratio]
+        Let $OPT$ denote the optimal solution to P1, $S$ is an $\alpha$-approximation
+        if:
+        \begin{displaymath}
+            V(OPT)\leq \alpha V(S)
+        \end{displaymath}
+    \end{definition}
+
+    \vspace{0.5cm}
+
+    \begin{theorem}[Sviridenko]
+        \begin{enumerate}
+            \item There is an algorithm which returns a set $S$ which is a $\frac{e}{e-1}$-approximation
+            \item No algorithm can do better unless P=NP
+        \end{enumerate}
+    \end{theorem}
+
+    \vspace{0.5cm}
 
-  \begin{theorem}[Shapley, 1953]
-    There is only one function satisfying Efficiency, Symmetry and
-    Fairness:
-    $$\phi_i(S,V) = \frac{1}{\vert S\vert!}\sum_{\pi\in\mathfrak{S}(S)}\Delta_i\big(V,S(i,\pi)\big)$$
-  \end{theorem}
-  \begin{itemize}
-  \item $\mathfrak{S}(S)$: set of permutations of $S$
-  \item $S(i,\pi)$: set of players before $i$ in $\pi$
-  \item $\Delta_i\big(V,T\big) =
-    V\big(T\big)-V\big(T\setminus\{i\})$: marginal contribution of $i$
-    to $T$
-  \end{itemize}
+    The algorithm is a combination of an enumeration method and a greedy heuristic.
 \end{frame}
 
-\begin{frame}{Properties}
-  \begin{itemize}
-  \item
-    \alert{if} $V$ is superadditive: $V(S\cup T) \geq V(S) + V(T)$
-\pause
+\begin{frame}{Greedy algorithm}
+    \begin{block}{Greedy($V$, $B$)}  
+        \begin{algorithmic}
+            \State $i \gets \argmax_{1\leq j\leq n}\frac{V(j)}{c_j}$
+            \State $S \gets \emptyset$
+            \While{$c_i\leq B - \sum_{j\in S} c_j$}
+                \State $S \gets S\cup\{i\}$
+                \State $i \gets \argmax_{1\leq j\leq n}
+                \frac{V(S\cup\{j\})-V(S)}{c_j}$
+            \EndWhile
+            \State \textbf{return} $S$
+        \end{algorithmic}
+    \end{block}
 
-    \alert{then} the Shapley Value is individually rational:
-    $\phi_i(S,V) > V(\{i\})$ 
-\pause
-  \item 
-    \alert{if} $V$ is supermodular: $\Delta_i(V,S)\leq\Delta_i(V,T),
-    \forall S\subset T$
-\pause
+    \vspace{0.5cm}
 
-    \alert{then} the Shapley Value lies in the core of
-    the game:
-    $$\sum_{i\in S'}\phi_i(S,V) \geq V(S'), \forall S'\subset S$$
-\pause
-  \item Hervé Moulin \& Scott Shenker (1999): Group-strategyproof auction
-  \end{itemize}
+    \begin{lemma}
+        Greedy has an \alert{unbounded} approximation ratio.
+    \end{lemma}
 \end{frame}
 
-\section{Example scenario}
+\begin{frame}
+    \begin{block}{MaxGreedy}
+        \begin{algorithmic}
+            \State $i^* \gets \argmax_{1\leq j\leq n} V(j)$
+            \State $S \gets$ Greedy($V$, $B$)
+            \If{$V(i^*)\geq V(S)$}
+                \State \textbf{return} $\{i^*\}$
+            \Else
+                \State \textbf{return} $S$
+            \EndIf
+        \end{algorithmic}
+    \end{block}
 
-\begin{frame}{Setup}
-  \begin{itemize}
-  \item  database $X$: $n\times p$ matrix
- 
-  \item $x_i\in \mathbf{R}^p$ $i$-th row of $X$: data of user $i$
+    \vspace{0.5cm}
 
-  \item for each user: $y_i$ variable of interest, could be:
-    \begin{itemize}
-    \item already observed
-    \item about to be observed 
-    \end{itemize}
-  \end{itemize}
+    \begin{lemma}
+        MaxGreedy is a $\frac{5e}{e-1}$ approximation
+    \end{lemma}
 \end{frame}
 
-\begin{frame}{Recommender system}
-\alert{Goal:} predict the variable of interest $y$ of a new user $x$
+\section{Strategic case}
 
-\alert{Linear model:} relation between data and variable of interest:
-$$y = \langle\beta,x\rangle + \varepsilon, \quad \varepsilon\simeq\mathcal{N}(0,\sigma^2)$$
+\begin{frame}{Reverse auction}
 
-\alert{Linear regression:} estimator $\tilde{\beta} =
-(\tr{X}X)^{-1}\tr{X}Y$
+    \alert{Problem:} In real life, user are strategic agents. They lie about their costs.
+    This is a \alert{reverse auction}: the experimenter wants to buy data
+    from users.
 
-\alert{Prediction:} $\tilde{y} = \langle\tilde{\beta},x\rangle$
-\end{frame}
+    \vspace{0.5cm}
+
+    \alert{Goal:} Design an auction mechanism:
+    \begin{itemize}
+        \item a selection rule $s: (c_1,\ldots,c_n) \mapsto (s_1,\ldots,s_n)$,
+            $s_i\in\{0,1\}$
+        \item a payment rule $p: (c_1,\ldots,c_n) \mapsto (p_1,\ldots,p_n)$,
+            $p_i\in\mathbb{R}^+$
+    \end{itemize}
 
-\begin{frame}{Revenue structure}
-\alert{Prediction error:} $\mse(X,x) = \expt{(\tilde{y} - y)^2} =
-\sigma^2\tr{x}(\tr{X}X)^{-1}x + \sigma^2$  
-\vspace{1cm}
-\begin{block}{Property}
-  \emph{The prediction error decreases as the size of the database increases}
-\end{block}
-\vspace{1cm}
-\alert{$\Rightarrow$} choose a decreasing function of the prediction
-error
-\pause
+    \vspace{0.5cm}
 
-\alert{Problem:} $\mse$ not submodular
+    As usual in auction mechanism design, we want payments to be:
+    \begin{itemize}
+        \item individually rational: $p_i\geq c_i$
+        \item truthful: reporting one's true cost is a dominant strategy
+        \item normalized: $s_i=0$ implies $p_i=0$
+    \end{itemize}
 \end{frame}
 
-\begin{frame}{Revenue structure (2)}
-Exponentially decreasing revenue:
-$$V(X,x) = \exp\big(-\mse(X,x)\big)$$
+\begin{frame}{Optimization problem for budget feasible auctions}
+    \begin{block}{P2}
+        \begin{align*}
+            &\text{maximize}\; V\big(s(c_1,\ldots,c_n)\big) = \log\det\Big(I_d + \lambda\sum_{i\in S}x_i x_i^T \Big)\\
+            &\text{subject to}\; \sum_{i\in S}\alert{p_i}\leq B\quad\text{(budget feasible)}
+        \end{align*}
+        where the payments are further constrained to be individually rational, truthful
+        and normalized
+    \end{block}
 
-Value of database $X$:
-$$V(X) = \int_{x\in\mathbb{R}^d} V(X,x) dx$$
+    \vspace{1cm}
+
+    This is again NP-hard $\Rightarrow$ we seek good approximation ratios
 \end{frame}
 
-\begin{frame}{Properties}
-\begin{block}{Closed-form expression}
-$$V(X) = C\sqrt{\det(\tr{X}X)}$$ 
-\end{block}
+\begin{frame}{Myerson's theorem}
+    \begin{theorem}[Myerson]
+        Let $(s, p)$ be a mechanism. If:
+        \begin{itemize}
+            \item $s$ is \alert{monotonic}: if $c_i\leq c_i'$ then $s_i \geq s_i'$
+            \item users are paid \alert{threshold payments}: $p_i = \inf\{c_i':i\notin s(c_i',c_{-i})\}$
+        \end{itemize}
+        Then, the mechanism is individually rational and truthful.
+    \end{theorem}
 
-\vspace{1cm}
-\pause
-\begin{block}{Supermodularity}
-$$X\mapsto\log\big(V(X)\big)$$ is supermodular
-\end{block}
-\pause
-\alert{$\Rightarrow$} we can apply the Shapley value theory!
-\end{frame}
+    \vspace{1cm}
 
-\begin{frame}{Conclusion}
+    \alert{Consequence:} It suffices to find a monotonic selection rule such that
+    the threshold payments are within the budget constraint.
+\end{frame}
 
-\begin{itemize}
-\item general framework to study the pricing problem
-\item the case of the linear regression
-\end{itemize}
-\vspace{1cm}
-\pause
-Future directions
-\begin{itemize}
-\item different revenue models
-\item fluid Shapley Value (Aumann-Shapley) for a continuum of players
-\end{itemize}
+\begin{frame}{Main result}
+    \begin{theorem}
+        There is a budget feasible, individually rational and truthful mechanism for
+        the experimental design problem which runs in polynomial time. Its
+        approximation ratio is:
+        \begin{displaymath}
+        \frac{10e-3 + \sqrt{64e^2-24e + 9}}{2(e-1)}
+        \simeq 12.98 
+        \end{displaymath}
+    \end{theorem}
 \end{frame}
-\end{document}
-\ No newline at end of file
+\end{document}
+
+