diff options
Diffstat (limited to 'slides')
| -rw-r--r-- | slides/slides.tex | 348 |
1 files changed, 194 insertions, 154 deletions
diff --git a/slides/slides.tex b/slides/slides.tex index ec2e5be..11ec280 100644 --- a/slides/slides.tex +++ b/slides/slides.tex @@ -2,18 +2,14 @@ \usepackage[utf8x]{inputenc} \usepackage[greek,english]{babel} \usepackage[LGR,T1]{fontenc} -\usepackage{lmodern} \usepackage{amsmath} -\usepackage{graphicx} -\usepackage{tikz} -\newcommand{\tr}[1]{#1^*} -\newcommand{\expt}[1]{\mathop{\mathbb{E}}\left[#1\right]} -\newcommand{\mse}{\mathop{\mathrm{MSE}}} +\usepackage{algpseudocode,algorithm} +\DeclareMathOperator*{\argmax}{arg\,max} \usetheme{Boadilla} \usecolortheme{beaver} -\title[Data monetization]{Data monetization: pricing user data\\with the Shapley Value} -\author{Thibaut \textsc{Horel}} -\institute[Technicolor]{Work with {\greektext Στρατής \textsc{Ιωαννίδης}}} +\title[Budget Feasible Mechanisms for Experimental Design]{Budget Feasible Mechanisms for Experimental Design} +\author{xx\\ Joint work with yy and zz} +\institute[]{} \setbeamercovered{transparent} \AtBeginSection[] @@ -30,192 +26,236 @@ \maketitle \end{frame} -\section{Problem} -\begin{frame}{Problem Overview} -\begin{center} -\includegraphics[width=0.8\textwidth]{schema.pdf} -\end{center} +\begin{frame} +\tableofcontents \end{frame} -\begin{frame}{Motivation} - -Why is it useful? - -\begin{itemize} -\item sell/buy user data -\item compensate users in your database -\end{itemize} - -\end{frame} +\section{Experimental Design} +\begin{frame}{Experimental Design} + An experimenter is about to conduct an experiment with users. + \begin{itemize} + \item $n$ users + \item $x_i\in\mathbb{R}^d$: public features of user $i$ + \item $c_i\in\mathbb{R}^+$: cost of user $i$ for taking part in the experiment + \end{itemize} -\begin{frame}{Challenges} + When paid $c_i$, user $i$ reveals $y_i$: + \begin{displaymath} + y_i = \beta^T x_i + \varepsilon_i,\quad\beta\in\mathbb{R}^d,\; + \varepsilon_i\sim\mathcal{N}(0,\sigma^2) + \end{displaymath} + $\beta$ is \alert{unknown} to the experimenter. -We need to understand: -\begin{itemize} -\item the recommender system -\item the revenue structure -\item the relation between the revenue and the value of the database -\item how to compensate individual users -\end{itemize} + \begin{block}{} + \alert{Question:} budget constraint $B$, which users to pick to learn $\beta$ as + well as possible without spending more than $B$? + \end{block} \end{frame} -\section{Shapley value} +\begin{frame}{Evaluation criterion} + \begin{block}{} + \alert{Question:} How to measure the helpfulness of a set of users $S$ in learning $\beta$? + \end{block} + + \vspace{0.5cm} -\begin{frame}{Individual value} - \begin{itemize} - \item $S$: set of players - \item $S'\subset S$: coalition - \item $V: \mathcal{P}(S)\to \mathbf{R}$: \alert{value} function: + Bayesian approach: \begin{itemize} - \item $V(\emptyset) = 0$ - \item $V(S')$: value of the coalition $S'$ + \item \emph{a priori} knowledge: prior distribution over $\beta$ + \item the entropy of $\beta$, $H(\beta)$ measures the uncertainty of the experimenter + \item after observing $Y_S = \{y_i,\; i\in S\}$, uncertainty becomes $H(\beta\mid Y_S)$ \end{itemize} - \end{itemize} - \alert{Question:} How to split the value of a subset across all its constituents? + \vspace{0.5cm} - \alert{Goal:} $\phi_i(S',V)$: value of player $i$ in $S'$ + The value of $S$ is the \alert{entropy reduction}: + \begin{displaymath} + V(S) = H(\beta) - H(\beta\mid Y_S) + \end{displaymath} \end{frame} -\begin{frame}{Solution attempts} - \begin{itemize} - \visible<1->{\item $\phi_i(S',V) = V(\{i\})$?} \visible<2->{``The whole is greater than the sum of its parts'' - $$V(S')\neq \sum_{i\in S'} V(\{i\})$$} - \visible<3->{\item $\phi_i(S',V) = V(S') - V(S'\setminus\{i\})$: the \alert{marginal contribution}?} \visible<4->{depends on the time the player joins the coalition} - \end{itemize} +\begin{frame}{Value function} + Gaussian linear model: + \begin{displaymath} + y_i = \beta^T x_i + \varepsilon_i,\quad\beta\in\mathbb{R}^d,\; + \varepsilon_i\sim\mathcal{N}(0,\sigma^2) + \end{displaymath} + + Typical prior distribution $\beta\sim\mathcal{N}(0,\tau I_d)$ + + \vspace{0.5cm} + + \begin{block}{} + Under the Gaussian linear model and Gaussian prior distribution: + \begin{displaymath} + V(S) = \log\det\Big(I_d + \lambda\sum_{i\in S}x_i x_i^T \Big) + \end{displaymath} + \end{block} + + \vspace{0.5cm} + + \alert{Remark:} $V$ can be computed efficiently using a Cholesky decomposition \end{frame} -\begin{frame}{Axioms} - \begin{description} - \item[Efficiency:] No money lost in the splitting process - $$V(S) = \sum_{i\in S} \phi_i(S,V)$$ -\pause - \item[Symmetry:] Equal pay for equal contribution +\section{Non-strategic case} +\begin{frame}{Optimization Problem} + \begin{block}{P1} + \begin{align*} + &\text{maximize}\; V(S) = \log\det\Big(I_d + \lambda\sum_{i\in S}x_i x_i^T \Big)\\ + &\text{subject to}\; \sum_{i\in S}c_i\leq B\quad\text{(Knapsack constraint)} + \end{align*} + \end{block} + + \onslide<2-> + Good news: + + $V$ is submodular (diminishing return): + \begin{displaymath} + \forall S\subseteq T\, \forall i,\; V(S\cup\{i\}) - V(S)\geq + V(T\cup\{i\}) - V(T) + \end{displaymath} - \alert{if} $\forall S'\subset S\setminus\{i,j\}, V(S'\cup\{i\}) = V(S'\cup\{j\})$ + \onslide<3-> + Bad news: - \alert{then} $\phi_i(S,V) = \phi_j(S,V)$ -\pause - \item[Fairness:] $j$'s contribution to $i$ equals $i$'s contribution to $j$ - $$\phi_i(S,V) - \phi_i(S\setminus\{j\}) = \phi_j(S,V)- \phi_j(S\setminus\{i\})$$ - \end{description} + Maximizing a submodular function under a knapsack constraint is NP-hard (even + in the specific case of $\log\det$) \end{frame} -\begin{frame}{Solution: the Shapley Value} +\begin{frame}{Approximation} + \begin{definition}[Approximation ratio] + Let $OPT$ denote the optimal solution to P1, $S$ is an $\alpha$-approximation + if: + \begin{displaymath} + V(OPT)\leq \alpha V(S) + \end{displaymath} + \end{definition} + + \vspace{0.5cm} + + \begin{theorem}[Sviridenko] + \begin{enumerate} + \item There is an algorithm which returns a set $S$ which is a $\frac{e}{e-1}$-approximation + \item No algorithm can do better unless P=NP + \end{enumerate} + \end{theorem} + + \vspace{0.5cm} - \begin{theorem}[Shapley, 1953] - There is only one function satisfying Efficiency, Symmetry and - Fairness: - $$\phi_i(S,V) = \frac{1}{\vert S\vert!}\sum_{\pi\in\mathfrak{S}(S)}\Delta_i\big(V,S(i,\pi)\big)$$ - \end{theorem} - \begin{itemize} - \item $\mathfrak{S}(S)$: set of permutations of $S$ - \item $S(i,\pi)$: set of players before $i$ in $\pi$ - \item $\Delta_i\big(V,T\big) = - V\big(T\big)-V\big(T\setminus\{i\})$: marginal contribution of $i$ - to $T$ - \end{itemize} + The algorithm is a combination of an enumeration method and a greedy heuristic. \end{frame} -\begin{frame}{Properties} - \begin{itemize} - \item - \alert{if} $V$ is superadditive: $V(S\cup T) \geq V(S) + V(T)$ -\pause +\begin{frame}{Greedy algorithm} + \begin{block}{Greedy($V$, $B$)} + \begin{algorithmic} + \State $i \gets \argmax_{1\leq j\leq n}\frac{V(j)}{c_j}$ + \State $S \gets \emptyset$ + \While{$c_i\leq B - \sum_{j\in S} c_j$} + \State $S \gets S\cup\{i\}$ + \State $i \gets \argmax_{1\leq j\leq n} + \frac{V(S\cup\{j\})-V(S)}{c_j}$ + \EndWhile + \State \textbf{return} $S$ + \end{algorithmic} + \end{block} - \alert{then} the Shapley Value is individually rational: - $\phi_i(S,V) > V(\{i\})$ -\pause - \item - \alert{if} $V$ is supermodular: $\Delta_i(V,S)\leq\Delta_i(V,T), - \forall S\subset T$ -\pause + \vspace{0.5cm} - \alert{then} the Shapley Value lies in the core of - the game: - $$\sum_{i\in S'}\phi_i(S,V) \geq V(S'), \forall S'\subset S$$ -\pause - \item Hervé Moulin \& Scott Shenker (1999): Group-strategyproof auction - \end{itemize} + \begin{lemma} + Greedy has an \alert{unbounded} approximation ratio. + \end{lemma} \end{frame} -\section{Example scenario} +\begin{frame} + \begin{block}{MaxGreedy} + \begin{algorithmic} + \State $i^* \gets \argmax_{1\leq j\leq n} V(j)$ + \State $S \gets$ Greedy($V$, $B$) + \If{$V(i^*)\geq V(S)$} + \State \textbf{return} $\{i^*\}$ + \Else + \State \textbf{return} $S$ + \EndIf + \end{algorithmic} + \end{block} -\begin{frame}{Setup} - \begin{itemize} - \item database $X$: $n\times p$ matrix - - \item $x_i\in \mathbf{R}^p$ $i$-th row of $X$: data of user $i$ + \vspace{0.5cm} - \item for each user: $y_i$ variable of interest, could be: - \begin{itemize} - \item already observed - \item about to be observed - \end{itemize} - \end{itemize} + \begin{lemma} + MaxGreedy is a $\frac{5e}{e-1}$ approximation + \end{lemma} \end{frame} -\begin{frame}{Recommender system} -\alert{Goal:} predict the variable of interest $y$ of a new user $x$ +\section{Strategic case} -\alert{Linear model:} relation between data and variable of interest: -$$y = \langle\beta,x\rangle + \varepsilon, \quad \varepsilon\simeq\mathcal{N}(0,\sigma^2)$$ +\begin{frame}{Reverse auction} -\alert{Linear regression:} estimator $\tilde{\beta} = -(\tr{X}X)^{-1}\tr{X}Y$ + \alert{Problem:} In real life, user are strategic agents. They lie about their costs. + This is a \alert{reverse auction}: the experimenter wants to buy data + from users. -\alert{Prediction:} $\tilde{y} = \langle\tilde{\beta},x\rangle$ -\end{frame} + \vspace{0.5cm} + + \alert{Goal:} Design an auction mechanism: + \begin{itemize} + \item a selection rule $s: (c_1,\ldots,c_n) \mapsto (s_1,\ldots,s_n)$, + $s_i\in\{0,1\}$ + \item a payment rule $p: (c_1,\ldots,c_n) \mapsto (p_1,\ldots,p_n)$, + $p_i\in\mathbb{R}^+$ + \end{itemize} -\begin{frame}{Revenue structure} -\alert{Prediction error:} $\mse(X,x) = \expt{(\tilde{y} - y)^2} = -\sigma^2\tr{x}(\tr{X}X)^{-1}x + \sigma^2$ -\vspace{1cm} -\begin{block}{Property} - \emph{The prediction error decreases as the size of the database increases} -\end{block} -\vspace{1cm} -\alert{$\Rightarrow$} choose a decreasing function of the prediction -error -\pause + \vspace{0.5cm} -\alert{Problem:} $\mse$ not submodular + As usual in auction mechanism design, we want payments to be: + \begin{itemize} + \item individually rational: $p_i\geq c_i$ + \item truthful: reporting one's true cost is a dominant strategy + \item normalized: $s_i=0$ implies $p_i=0$ + \end{itemize} \end{frame} -\begin{frame}{Revenue structure (2)} -Exponentially decreasing revenue: -$$V(X,x) = \exp\big(-\mse(X,x)\big)$$ +\begin{frame}{Optimization problem for budget feasible auctions} + \begin{block}{P2} + \begin{align*} + &\text{maximize}\; V\big(s(c_1,\ldots,c_n)\big) = \log\det\Big(I_d + \lambda\sum_{i\in S}x_i x_i^T \Big)\\ + &\text{subject to}\; \sum_{i\in S}\alert{p_i}\leq B\quad\text{(budget feasible)} + \end{align*} + where the payments are further constrained to be individually rational, truthful + and normalized + \end{block} -Value of database $X$: -$$V(X) = \int_{x\in\mathbb{R}^d} V(X,x) dx$$ + \vspace{1cm} + + This is again NP-hard $\Rightarrow$ we seek good approximation ratios \end{frame} -\begin{frame}{Properties} -\begin{block}{Closed-form expression} -$$V(X) = C\sqrt{\det(\tr{X}X)}$$ -\end{block} +\begin{frame}{Myerson's theorem} + \begin{theorem}[Myerson] + Let $(s, p)$ be a mechanism. If: + \begin{itemize} + \item $s$ is \alert{monotonic}: if $c_i\leq c_i'$ then $s_i \geq s_i'$ + \item users are paid \alert{threshold payments}: $p_i = \inf\{c_i':i\notin s(c_i',c_{-i})\}$ + \end{itemize} + Then, the mechanism is individually rational and truthful. + \end{theorem} -\vspace{1cm} -\pause -\begin{block}{Supermodularity} -$$X\mapsto\log\big(V(X)\big)$$ is supermodular -\end{block} -\pause -\alert{$\Rightarrow$} we can apply the Shapley value theory! -\end{frame} + \vspace{1cm} -\begin{frame}{Conclusion} + \alert{Consequence:} It suffices to find a monotonic selection rule such that + the threshold payments are within the budget constraint. +\end{frame} -\begin{itemize} -\item general framework to study the pricing problem -\item the case of the linear regression -\end{itemize} -\vspace{1cm} -\pause -Future directions -\begin{itemize} -\item different revenue models -\item fluid Shapley Value (Aumann-Shapley) for a continuum of players -\end{itemize} +\begin{frame}{Main result} + \begin{theorem} + There is a budget feasible, individually rational and truthful mechanism for + the experimental design problem which runs in polynomial time. Its + approximation ratio is: + \begin{displaymath} + \frac{10e-3 + \sqrt{64e^2-24e + 9}}{2(e-1)} + \simeq 12.98 + \end{displaymath} + \end{theorem} \end{frame} -\end{document}
\ No newline at end of file +\end{document} + + |
