\documentclass{beamer} \usepackage[utf8x]{inputenc} \usepackage[greek,english]{babel} \usepackage[LGR,T1]{fontenc} \usepackage{lmodern} \usepackage{amsmath} \usepackage{graphicx} \usepackage{tikz} \newcommand{\tr}[1]{#1^*} \newcommand{\expt}[1]{\mathop{\mathbb{E}}\left[#1\right]} \newcommand{\mse}{\mathop{\mathrm{MSE}}} \usetheme{Boadilla} \usecolortheme{beaver} \title[Data monetization]{Data monetization: pricing user data\\with the Shapley Value} \author{Thibaut \textsc{Horel}} \institute[Technicolor]{Work with {\greektext Στρατής \textsc{Ιωαννίδης}}} \setbeamercovered{transparent} \AtBeginSection[] { \begin{frame} \frametitle{Outline} \tableofcontents[currentsection] \end{frame} } \begin{document} \begin{frame} \maketitle \end{frame} \section{Problem} \begin{frame}{Problem Overview} \begin{center} \includegraphics[width=0.8\textwidth]{schema.pdf} \end{center} \end{frame} \begin{frame}{Motivation} Why is it useful? \begin{itemize} \item sell/buy user data \item compensate users in your database \end{itemize} \end{frame} \begin{frame}{Challenges} We need to understand: \begin{itemize} \item the recommender system \item the revenue structure \item the relation between the revenue and the value of the database \item how to compensate individual users \end{itemize} \end{frame} \section{Shapley value} \begin{frame}{Individual value} \begin{itemize} \item $S$: set of players \item $S'\subset S$: coalition \item $V: \mathcal{P}(S)\to \mathbf{R}$: \alert{value} function: \begin{itemize} \item $V(\emptyset) = 0$ \item $V(S')$: value of the coalition $S'$ \end{itemize} \end{itemize} \alert{Question:} How to split the value of a subset across all its constituents? \alert{Goal:} $\phi_i(S',V)$: value of player $i$ in $S'$ \end{frame} \begin{frame}{Solution attempts} \begin{itemize} \visible<1->{\item $\phi_i(S',V) = V(\{i\})$?} \visible<2->{``The whole is greater than the sum of its parts'' $$V(S')\neq \sum_{i\in S'} V(\{i\})$$} \visible<3->{\item $\phi_i(S',V) = V(S') - V(S'\setminus\{i\})$: the \alert{marginal contribution}?} \visible<4->{depends on the time the player joins the coalition} \end{itemize} \end{frame} \begin{frame}{Axioms} \begin{description} \item[Efficiency:] No money lost in the splitting process $$V(S) = \sum_{i\in S} \phi_i(S,V)$$ \pause \item[Symmetry:] Equal pay for equal contribution \alert{if} $\forall S'\subset S\setminus\{i,j\}, V(S'\cup\{i\}) = V(S'\cup\{j\})$ \alert{then} $\phi_i(S,V) = \phi_j(S,V)$ \pause \item[Fairness:] $j$'s contribution to $i$ equals $i$'s contribution to $j$ $$\phi_i(S,V) - \phi_i(S\setminus\{j\}) = \phi_j(S,V)- \phi_j(S\setminus\{i\})$$ \end{description} \end{frame} \begin{frame}{Solution: the Shapley Value} \begin{theorem}[Shapley, 1953] There is only one function satisfying Efficiency, Symmetry and Fairness: $$\phi_i(S,V) = \frac{1}{\vert S\vert!}\sum_{\pi\in\mathfrak{S}(S)}\Delta_i\big(V,S(i,\pi)\big)$$ \end{theorem} \begin{itemize} \item $\mathfrak{S}(S)$: set of permutations of $S$ \item $S(i,\pi)$: set of players before $i$ in $\pi$ \item $\Delta_i\big(V,T\big) = V\big(T\big)-V\big(T\setminus\{i\})$: marginal contribution of $i$ to $T$ \end{itemize} \end{frame} \begin{frame}{Properties} \begin{itemize} \item \alert{if} $V$ is superadditive: $V(S\cup T) \geq V(S) + V(T)$ \pause \alert{then} the Shapley Value is individually rational: $\phi_i(S,V) > V(\{i\})$ \pause \item \alert{if} $V$ is supermodular: $\Delta_i(V,S)\leq\Delta_i(V,T), \forall S\subset T$ \pause \alert{then} the Shapley Value lies in the core of the game: $$\sum_{i\in S'}\phi_i(S,V) \geq V(S'), \forall S'\subset S$$ \pause \item Hervé Moulin \& Scott Shenker (1999): Group-strategyproof auction \end{itemize} \end{frame} \section{Example scenario} \begin{frame}{Setup} \begin{itemize} \item database $X$: $n\times p$ matrix \item $x_i\in \mathbf{R}^p$ $i$-th row of $X$: data of user $i$ \item for each user: $y_i$ variable of interest, could be: \begin{itemize} \item already observed \item about to be observed \end{itemize} \end{itemize} \end{frame} \begin{frame}{Recommender system} \alert{Goal:} predict the variable of interest $y$ of a new user $x$ \alert{Linear model:} relation between data and variable of interest: $$y = \langle\beta,x\rangle + \varepsilon, \quad \varepsilon\simeq\mathcal{N}(0,\sigma^2)$$ \alert{Linear regression:} estimator $\tilde{\beta} = (\tr{X}X)^{-1}\tr{X}Y$ \alert{Prediction:} $\tilde{y} = \langle\tilde{\beta},x\rangle$ \end{frame} \begin{frame}{Revenue structure} \alert{Prediction error:} $\mse(X,x) = \expt{(\tilde{y} - y)^2} = \sigma^2\tr{x}(\tr{X}X)^{-1}x + \sigma^2$ \vspace{1cm} \begin{block}{Property} \emph{The prediction error decreases as the size of the database increases} \end{block} \vspace{1cm} \alert{$\Rightarrow$} choose a decreasing function of the prediction error \pause \alert{Problem:} $\mse$ not submodular \end{frame} \begin{frame}{Revenue structure (2)} Exponentially decreasing revenue: $$V(X,x) = \exp\big(-\mse(X,x)\big)$$ Value of database $X$: $$V(X) = \int_{x\in\mathbb{R}^d} V(X,x) dx$$ \end{frame} \begin{frame}{Properties} \begin{block}{Closed-form expression} $$V(X) = C\sqrt{\det(\tr{X}X)}$$ \end{block} \vspace{1cm} \pause \begin{block}{Supermodularity} $$X\mapsto\log\big(V(X)\big)$$ is supermodular \end{block} \pause \alert{$\Rightarrow$} we can apply the Shapley value theory! \end{frame} \begin{frame}{Conclusion} \begin{itemize} \item general framework to study the pricing problem \item the case of the linear regression \end{itemize} \vspace{1cm} \pause Future directions \begin{itemize} \item different revenue models \item fluid Shapley Value (Aumann-Shapley) for a continuum of players \end{itemize} \end{frame} \end{document}