From d010cdae55a7e19d78a24e46c1db3058d69347fc Mon Sep 17 00:00:00 2001
From: Thibaut Horel
Date: Wed, 15 Aug 2012 18:27:30 -0700
Subject: Repository restructuring (has to be clean now that it is shared ;)

---
 notes.tex         | 391 ++++++++++++------------------------------------------
 notes2.tex        | 162 ----------------------
 old_notes.tex     | 377 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 schema.pdf        | Bin 844491 -> 0 bytes
 slides.tex        | 221 ------------------------------
 slides/schema.pdf | Bin 0 -> 844491 bytes
 slides/slides.tex | 221 ++++++++++++++++++++++++++++++
 7 files changed, 686 insertions(+), 686 deletions(-)
 delete mode 100644 notes2.tex
 create mode 100644 old_notes.tex
 delete mode 100644 schema.pdf
 delete mode 100644 slides.tex
 create mode 100644 slides/schema.pdf
 create mode 100644 slides/slides.tex

diff --git a/notes.tex b/notes.tex
index 8dc2c7b..92eabb5 100644
--- a/notes.tex
+++ b/notes.tex
@@ -3,6 +3,8 @@
 \usepackage{amsmath,amsthm,amsfonts}
 \usepackage{comment}
 \newtheorem{lemma}{Lemma}
+\newtheorem{fact}{Fact}
+\newtheorem{example}{Example}
 \newtheorem{prop}{Proposition}
 \newcommand{\var}{\mathop{\mathrm{Var}}}
 \newcommand{\condexp}[2]{\mathop{\mathbb{E}}\left[#1|#2\right]}
@@ -14,35 +16,102 @@
 \newcommand{\trace}{\mathop{\mathrm{tr}}}
 \begin{document}
 
+\section{Problem}
+
+\begin{itemize}
+\item database $D = (X_i)_{1\leq i\leq n}$
+\item $(X_i)_{1\leq i\leq n}\in\big(\mathbf{R}^d\big)^n$ sampled in an i.i.d.
+  fashion from the probability distribution $\mu$
+\end{itemize}
+
+We assume that there is a common variable $P$, the underlying phenomenon that
+we are interested in, and that all the variables in the database are noisy
+realisations of this phenomenon:
+\begin{displaymath}
+  X_i = P + N_i
+\end{displaymath}
+where the $N_i$ are pairwise independent and also independent from $P$.
+
+
+We are interested in defining the \emph{value} of a subset $S$ of the
+database $D$. The approaches to such a definition can be classified into two
+main families:
+\begin{itemize}
+  \item \emph{Information theoretic:} we use the entropy of $P$ as a measure
+    of the uncertainty. The value of a subset $S$ is then the decrease in
+    entropy induced by revealing it (illustrated in the sketch after the
+    problem list below):
+    \begin{displaymath}
+      V(S) = H(P) - H(P\,|\,S)
+    \end{displaymath}
+  \item \emph{Statistical:} in this case we are trying to estimate the
+    phenomenon, and the variance of the estimate is used as a measure of
+    the uncertainty. The value of a set $S$ can be defined as the average
+    reduction of the variance over all the points in the database once $S$
+    is observed:
+    \begin{displaymath}
+      V(S) = \frac{1}{|D|}\sum_{x\in D} \sigma_x^2 - \sigma_{x|S}^2
+    \end{displaymath}
+\end{itemize}
+
+Once a notion of value is defined, several problems can be addressed:
+\begin{enumerate}
+  \item \emph{budget auctions:} the points in the database are associated
+    with users. Each user $i$ has a cost $c_i$ for revealing his data. We
+    are given a budget $B$ and want to select a subset $S$ of users and a
+    payment scheme which is truthful. The value of the subset $S$ should be
+    as large as possible under the constraint that the payments stay within
+    budget.
+  \item \emph{value sharing:} the goal is to find a value sharing method to
+    split the whole value of a subset $S$ among its participants.
+  \item \emph{$k$-best auctions:} very similar to \emph{budget auctions},
+    but instead of a budget constraint, the constraint is on the number of
+    people in the subset $S$.
+\end{enumerate}
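+
+As a quick illustration of the information-theoretic value (and of the
+diminishing-returns behaviour invoked below), here is a small Python sketch.
+It assumes a jointly Gaussian model, $P\sim\mathcal{N}(0,\sigma_P^2)$ with
+independent Gaussian noise, so that $H(P\,|\,S)$ has a closed form; the
+function name and parameters are ours, for illustration only.
+\begin{verbatim}
+import numpy as np
+
+def info_value(noise_vars, prior_var=1.0):
+    # V(S) = H(P) - H(P|S) for Gaussian P ~ N(0, prior_var) and
+    # observations X_i = P + N_i with N_i ~ N(0, v), v in noise_vars.
+    post_var = 1.0 / (1.0 / prior_var + sum(1.0 / v for v in noise_vars))
+    # the (1/2) log(2*pi*e) terms of the two entropies cancel:
+    return 0.5 * np.log(prior_var / post_var)
+
+# marginal values shrink as the set grows (diminishing returns):
+gains = [info_value([1.0] * n) - info_value([1.0] * (n - 1))
+         for n in range(1, 6)]
+assert all(g1 >= g2 for g1, g2 in zip(gains, gains[1:]))
+\end{verbatim}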
+
+These problems have already been studied, and optimal or near-optimal
+solutions are already known when the value function is submodular.
+Fortunately, it can be shown that the information-theoretic value defined
+above is submodular.
+
+\section{Submodularity}
+
+In this section, we consider that we are given a \emph{universe} set $U$.
+A set function $f$ is a function defined on the power set of $U$,
+$\mathfrak{P}(U)$.
+
+A set function $f$ defined on $\mathfrak{P}(U)$ is said to be
+\emph{increasing} if it is increasing with regard to inclusion, that is:
 
-\section{pomme}
-
-In this section, we will consider that we are given a \emph{universe}
-set $U$ such that all the sets we consider are subsets or $U$ and all
-functions defined on sets are defined on the power set of $U$,
-$\mathfrak{P}(U)$.
-
-A function $f$ defined on $\mathfrak{P}(U)$ will be said
-\emph{increasing} if it is increasing with regards to inclusion, that
-is:
 \begin{displaymath}
   \forall\,S\subseteq T,\quad f(S)\leq f(T)
 \end{displaymath}
+
 A \emph{decreasing} function on $\mathfrak{P}(U)$ is defined similarly.
 
+A set function $f$ defined on $\mathfrak{P}(U)$ is said to be
+\emph{submodular} if it verifies the diminishing returns property: the
+marginal increment obtained when adding a point to a set is a decreasing
+set function. More formally, for any point $x$ in $U$, the marginal
+increment of $f$ with respect to $x$ is the set function defined as:
+\begin{displaymath}
+  \Delta_x f(S) = f(S\cup\{x\}) - f(S)
+\end{displaymath}
+Then, $f$ is \emph{submodular} if and only if, for all $x$ in $U$,
+$\Delta_x f$ is a decreasing set function.
+
+Similarly, a \emph{supermodular} function is one whose marginal increments
+are increasing set functions.
 \begin{prop}
-  Let $R:\mathbf{R}\rightarrow \mathbf{R}$ be a decreasing concave
-  function and $f:\mathfrak{P}(U)\rightarrow\mathbf{R}$ be a
-  decreasing submodular function, then the composed function $R\circ
-  f$ is increasing and supermodular.
+  Let $R:\mathbf{R}\rightarrow \mathbf{R}$ be a decreasing concave function and
+  $f:\mathfrak{P}(U)\rightarrow\mathbf{R}$ be a decreasing submodular function.
+  Then the composed function $R\circ f$ is increasing and supermodular.
 \end{prop}
 
 \begin{proof}
-  The increasingness of $R\circ f$ follows immediately from the
-  decreasingness of $R$ and $f$.
+  The increasingness of $R\circ f$ follows immediately from the decreasingness
+  of $R$ and $f$.
 
-  For the supermodularity, let $S$ and $T$ be two sets such that
-  $S\subseteq T$. By decreasingness of $f$, we have:
+  For the supermodularity, let $S$ and $T$ be two sets such that $S\subseteq
+  T$. By decreasingness of $f$, we have:
   \begin{displaymath}
     \forall\,V,\quad f(T)\leq f(S)\quad\mathrm{and}\quad f(T\cup V)\leq f(S\cup V)
   \end{displaymath}
@@ -90,288 +159,4 @@ A \emph{decreasing} function on $\mathfrak{P}(U)$ is defined similarly.
 which is exactly the supermodularity of $R\circ f$.
 \end{proof}
-
-
-\section{Understanding the recommender system}
-
-\subsection{General problem}
-
-We already have a database $D_n$ of $n$ users. For each user $i$ we
-have a set of $k$ explanatory variables (features), this is a vector
-$x_i$.
-
-The problem is the following: we are about to start an experiment where for
-some of the users in the database, a variable of interest $y_i$ will
-be revealed (it could be for example a medical survey, a website which
-buys some data from another website). 
From the pairs $(x_i, y_i)$ that -we will acquire through the experiment, we are going to compute a -regression function $f_n$ which will allow us to predict for a new $x$ -its associated $y$. The accuracy of this prediction will be measured -by the Mean Squared Error : -\begin{displaymath} - \mse(f_n) = \expt{\big(f_n(x)-y\big)^2} -\end{displaymath} - -We would like to understand the impact of the number of users who take -part in the experiment. Especially, how much does adding a user to the -experiment impacts the MSE. - -\subsection{From the bivariate normal case to linear regression} -If $(X,Y)$ is drawn from a bivariate normal distribution with mean -vector $\mu$ and covariance matrix $\Sigma$. Then, one can -write: -\begin{displaymath} - Y = \condexp{Y}{X} + \big(Y-\condexp{Y}{X}\big) -\end{displaymath} -In this particular case, $\condexp{Y}{X}$ is a linear function of $X$: -\begin{displaymath} -\condexp{Y}{X} = \alpha X + \beta -\end{displaymath} -where $\alpha$ and $\beta$ can be expressed as a function of $\mu$ and -$\Sigma$. Writing $\varepsilon = Y-\condexp{Y}{X}$, it is easy to see -that $\expt{X\varepsilon}=0$. Furthermore $\varepsilon$ is also normally -distributed. Under these assumptions, it can be proven that the least -square estimator for $(\alpha,\beta)$ is optimal (it reaches the -Cramér-Rao bound). - -\subsection{Linear regression} - -We assume a linear model: -\begin{displaymath} - y_i = \beta\cdot x_i + \varepsilon_i -\end{displaymath} - -Where $\varepsilon_i$ is a normal random variable uncorrelated to -$x_i$. We also assume that $\var(\varepsilon_i)$ is constant -(homoscedasticity), $\sigma^2$ will denote the common value. - -From the database we compute the least-squared estimator of $\beta$: -\begin{displaymath} - \hat{\beta}_n = (\tr XX)^{-1}\tr XY -\end{displaymath} -where $X$ is a $n\times k$ matrix ($k$ is the number of explanatory -variables) whose $i$-th row $\tr x_i$ and $Y$ is $(y_1,\ldots,y_n)$. - -The regression function is simply the inner product of $\hat{\beta}_n$ -and the new vector of explanatory variables $x$. - -From there we can compute the MSE: -\begin{displaymath} - \mse(D_n) - =\expt{\left(\beta\cdot x + \varepsilon - \hat\beta_n\cdot x\right)^2} - = \tr x\expt{(\hat\beta_n-\beta)\cdot (\hat\beta_n-\beta)} x + \expt{\varepsilon^2} -\end{displaymath} -where we used that -$\expt{x\varepsilon}=0$. The variance-covariance matrix of -$\hat\beta_n$ is equal to $\sigma^2(\tr XX)^{-1}$. Finally we get: -\begin{displaymath} - \mse(D_n) - = \sigma^2\tr x(\tr XX)^{-1}x + \sigma^2 -\end{displaymath} - -\subsubsection*{Monotonicity} - -We first want to study the impact of adding one observation to the -database. First, notice that: -\begin{displaymath} - \tr XX = \sum_{i=1}^n x_i \tr x_i -\end{displaymath} -Let write $A= \tr X X$. Then, adding $x_0$ to the database will change -$A$ to $A+x_0\tr x_0$. - -The following derivation will make an extensive use of the -Sherman-Morrisson formula \cite{sm} (which can be proven by direct -verification). For any invertible matrix $A$: -\begin{equation}\label{eq:inverse} -(A+x\tr y)^{-1} = A^{-1} - \frac{A^{-1}x\tr yA^{-1}}{1+\tr x A^{-1}y} -\end{equation} - -$A^{-1}$ is the inverse of a positive semidefinite matrix. Hence it is -also positive semidefinite. 
We will denote by $\ip{x}{y}$ the scalar product defined by $A^{-1}$, -that is: -\begin{displaymath} - \ip{x}{y} = \tr x A^{-1} y = \tr y A^{-1} x -\end{displaymath} - -Using \eqref{eq:inverse} we get: -\begin{displaymath} -\begin{split} - \tr x (A + x_0\tr x_0)^{-1} x & = \tr x A^{-1} x - \frac{\tr x - A^{-1}x_0\tr x_0A^{-1} x}{1+\tr x_0 A^{-1}x_0 }\\ -& = \tr x A^{-1} x - \frac{\ip{x}{x_0}^2}{1+\norm{x_0}^2} -\end{split} -\end{displaymath} - -Thus: -\begin{equation}\label{eq:decrease} -\mse(D_n\cup\{x_0\}) = \mse(D_n) - \frac{\sigma^2\ip{x}{x_0}^2}{1+\norm{x_0}^2} -\end{equation} - -\emph{Adding one observation to the database decreases the MSE.} - -\subsubsection*{Submodularity} - -Let $D_m$ a database of size $m$ containing $D_n$: $D_m$ is obtained -from $D_n$ by adding some observations. We would like to show that -adding one observation to $D_m$ yields a smaller decrease in MSE than -adding the same observation to $D_n$: -\begin{displaymath} - \mse(D_m)-\mse(D_m\cup\{x_0\})\leq \mse(D_n)-\mse(D_n\cup\{x_0\}) -\end{displaymath} - -First, remark that it is necessary and sufficient to prove this property when $D_n$ and -$D_m$ differ by only one observation. Indeed, if the property is true -in general, then it will also be true when the two databases differ by -only one observation. Conversely, if the property is true when the two -databases differ by only one observation, then applying the property -repeatedly yields the general property. - -Using \eqref{eq:decrease}, the decrease of MSE when adding $x_0$ to -the database is: -\begin{displaymath} - \frac{\sigma^2(\tr x A^{-1} x_0)^2}{1+\tr x_0 A^{-1} x_0} -\end{displaymath} - -If we denote by $z$ the additional observation present in $D_m$ and -not in $D_n$, then we would like to prove that: -\begin{displaymath} - \frac{\sigma^2(\tr x A^{-1} x_0)^2}{1+\tr x_0 A^{-1} x_0} -\geq \frac{\sigma^2\left(\tr x (A+z\tr z)^{-1} x_0\right)^2}{1+\tr x_0 (A+z\tr z)^{-1} x_0} -\end{displaymath} - -Using the same notations as before, this is equivalent -to: -\begin{displaymath} - \frac{\ip{x}{x_0}^2}{1+\norm{x_0}^2} -\geq \frac{\left(\left(1+\norm{z}^2\right)\ip{x}{x_0}-\ip{x}{z}\ip{z}{x_0}\right)^2} -{\left(1+\norm{z}^2\right)\big((1+\norm{z}^2)(1+\norm{x_0}^2)-\ip{x_0}{z}^2\big)} -\end{displaymath} - -By the Cauchy-Schwarz inequality: -\begin{displaymath} - (1+\norm{z}^2)(1+\norm{x_0}^2)-\ip{x_0}{z}^2 > 0 -\end{displaymath} - -Thus the previous inequality is consequently equivalent to: -\begin{multline*} - \left(1+\norm{z}^2\right)^2\left(1+\norm{x_0}^2\right)\ip{x}{x_0}^2 --\left(1+\norm{z}^2\right)\ip{x_0}{z}^2\ip{x}{x_0}^2\\ -\geq \left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)^2\ip{x}{x_0}^2 -+ \left(1+\norm{x_0}^2\right)\ip{x}{z}^2\ip{z}{x_0}^2\\ --2\left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)\ip{x}{x_0}\ip{x}{z}\ip{z}{x_0} -\end{multline*} - -\begin{multline*} -2\left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)\ip{x}{x_0}\ip{x}{z}\ip{z}{x_0}\\ -\geq \left(1+\norm{x_0}^2\right)\ip{x}{z}^2\ip{z}{x_0}^2 -+ \left(1+\norm{z}^2\right)\ip{x_0}{z}^2\ip{x}{x_0}^2 -\end{multline*} - -This last inequality is not true in general. As soon as $x$, $x_0$ and -$z$ span a 2-dimensional space, it is possible that for example -$\ip{x}{x_0}$ and $\ip{x}{z}$ are positive and $\ip{z}{x_0}$ -negative. Then the left term of the inequality will be negative and -cannot be greater than the right term which is always positive. - -In the one-dimensional case, the inner product $\ip{x}{z}$ can be -written as $\lambda xz$ for some positive $\lambda$. 
Then the last -inequality becomes: -\begin{displaymath} - 2\geq \frac{\lambda z^2}{1+\lambda z^2} -+ \frac{\lambda x_0^2}{1+\lambda x_0^2} -\end{displaymath} -which is trivially true (a more direct proof for the one-dimensional -case is of course possible). - -In order to understand more precisely under which assumptions the -above inequality could become true, it is convenient to look at it -from the quadratic form perspective. Indeed this inequality can be -rewritten as: - -\begin{equation}\label{eq-inequality} -\tr x B x \geq 0 -\end{equation} - -with: -\begin{align*} - B = &\, \left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)\ip{x_0}{z} - (x_0\tr z+z\tr x_0)\\ -& -\ip{x_0}{z}^2\Big( \left(1+\norm{x_0}^2\right)z\tr z + \left(1+\norm{z}^2\right)x_0\tr z\big) -\end{align*} - -This quadratic form is degenerate, its kernel is $x_0^{\bot}\cap -z^\bot$ which is of dimension $k-2$. - -\paragraph{Case when $\norm{x_0}=\norm{z}=1$} In this case, it suffices to study the quadratic form given by matrix -$B'$ with: -\begin{displaymath} - B' = 2\ip{x_0}{z}(x_0\tr z+z\tr x_0) -\ip{x_0}{z}^2(z\tr z + x_0\tr x_0) -\end{displaymath} - -Writing $a = \ip{x_0}{z}$, the two non-zero eigenvalues are: -\begin{align*} - \lambda_1 & = -2a^3 + 2a^2 + 4a = -2a(a+1)(a-2)\\ - \lambda_2 & = 2a^3 + 2a^2 - 4a = 2a(a-1)(a+2) -\end{align*} - -which are respectively associated with the eigenvectors: -\begin{align*} - x_1 & = x_0+z\\ - x_2 & = x_0 - z -\end{align*} - -By the Cauchy-Schwarz inequality, $a\in]-1,1[$, and the two -eigenvalues are of opposite sign on this interval. Thus inequality -\eqref{eq-inequality} does not hold for all $x$. - -\paragraph{In expectation?} If we assume a prior knowledge on the -distribution of $x$, writing $\Sigma$ the variance-covariance matrix -of $x$ and $\mu$ its mean vector, then taking the expectation of -\eqref{eq-inequality} we get: -\begin{displaymath} - \expt{\tr x B' x} = \trace(B'\Sigma) + \tr\mu B'\mu -\end{displaymath} - -\nocite{shapley,inverse,recommendation,cook,shapleyor,subsetselection11,lse} -\bibliographystyle{plain} -\bibliography{notes.bib} - -\section*{Appendix} - -\paragraph{Previous attempt at taming the submodularity} - -The inequality only depends on the projection of $x$ on the plane -spanned by $x_0$ and $z$. 
Writing -\begin{displaymath} - x = \lambda x_0 + \mu z + v,\quad\mathrm{with}\quad v \,\bot\, - \mathrm{span}\{x_0, v\} -\end{displaymath} -the previous inequality becomes: -\begin{multline*} -2\left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)\ip{x_0}{z} -\left(\lambda\norm{x_0}^2+\mu\ip{x_0}{z}\right) -\left(\lambda\ip{x_0}{z}+\mu\norm{z}^2\right)\\ -- \left(1+\norm{x_0}^2\right)\ip{x_0}{z}^2\left(\lambda\ip{x_0}{z}+\mu\norm{z}^2\right)^2\\ -- -\left(1+\norm{z}^2\right)\ip{x_0}{z}^2\left(\lambda\norm{x_0}^2+\mu\ip{x_0}{z}\right)^2\geq 0 -\end{multline*} - -By expanding and reordering the terms, we obtain a quadratic function -of $\lambda$ and $\mu$: -\begin{multline*} - \lambda^2\ip{x_0}{z}^2\left[2\norm{x_0}^2+\norm{x_0}^4+\norm{x_0}^2\norm{z}^2 - +(1+\norm{x_0})^2\big(\norm{x_0}^2\norm{z}^2-\ip{x_0}{z}^2\big)\right]\\ -+\mu^2\ip{x_0}{z}^2\left[2\norm{z}^2+\norm{z}^4+\norm{x_0}^2\norm{z}^2 - +(1+\norm{z})^2\big(\norm{x_0}^2\norm{z}^2-\ip{x_0}{z}^2\big)\right]\\ -+2\lambda\mu\ip{x_0}{z}\Big[\norm{x_0}^2\norm{z}^2 -\big(\norm{x_0}^2\norm{z}^2-\ip{x_0}{z}^2\big)\\ -+\ip{x_0}{z}^2+\norm{x_0}^2\norm{z}^2 -\big(1+\norm{x_0}^2+\norm{z}^2\big)\Big]\geq 0 -\end{multline*} - -This inequality will be true for all $\lambda$ and $\mu$ if and only -if the quadratic form is positive semidefinite. As its trace is -positive, this is equivalent to the positiveness of its determinant. - - -\end{document} \ No newline at end of file +\end{document} diff --git a/notes2.tex b/notes2.tex deleted file mode 100644 index 92eabb5..0000000 --- a/notes2.tex +++ /dev/null @@ -1,162 +0,0 @@ -\documentclass{article} -\usepackage[utf8]{inputenc} -\usepackage{amsmath,amsthm,amsfonts} -\usepackage{comment} -\newtheorem{lemma}{Lemma} -\newtheorem{fact}{Fact} -\newtheorem{example}{Example} -\newtheorem{prop}{Proposition} -\newcommand{\var}{\mathop{\mathrm{Var}}} -\newcommand{\condexp}[2]{\mathop{\mathbb{E}}\left[#1|#2\right]} -\newcommand{\expt}[1]{\mathop{\mathbb{E}}\left[#1\right]} -\newcommand{\norm}[1]{\lVert#1\rVert} -\newcommand{\tr}[1]{#1^*} -\newcommand{\ip}[2]{\langle #1, #2 \rangle} -\newcommand{\mse}{\mathop{\mathrm{MSE}}} -\newcommand{\trace}{\mathop{\mathrm{tr}}} -\begin{document} - -\section{Problem} - -\begin{itemize} -\item database $D = (X_i)_{1\leq i\leq n}$ -\item $(X_i)_{1\leq i\leq n}\in\big(\mathbf{R^d}\big)^n$ sampled in an i.i.d - fashion from the probability distribution $\mu$ -\end{itemize} - -We assume that there is a common variable $P$ which is the underlying -phenomenon that we are interesting in, and all the variables in the database, -are noisy realisations of this phenomenon: -\begin{displaymath} - X_i = P + N_i -\end{displaymath} -where the $N_i$ are pairwise independent and also independent from $H$. - - -We are interested in defining the $\emph{value}$ of a subset $S$ of the -database $D$. The approaches to such a definition can be classified in two main -families: -\begin{itemize} - \item \emph{Information theoretic:} We use the entropy of $P$ as a measure - of the uncertainty. Then the value of a subset $S$ is the decrease of - entropy induced by revealing it: - \begin{displaymath} - V(S) = H(P) - H(P\,|\,S) - \end{displaymath} - \item \emph{Statistical:} in this case you are trying to estimate the - phenomenon. The variance of the estimation is used as a measure of - the uncertainty. 
The value of a set $S$ can be defined as the average - reduction of the variance over all the points in the database once you - observe $S$: - \begin{displaymath} - V(S) = \frac{1}{|D|}\sum_{x\in D} \sigma_x^2 - \sigma_{x|A}^2 - \end{displaymath} -\end{itemize} - -Once you have defined a notion of value, there are several problems you can -address: -\begin{enumerate} - \item \emph{budget auctions:} the points in the database are associated - with users. Each user $i$ has a cost $c_i$ for revealing his data. You - are given a budget $B$ and you want to select a subset $S$ of users and - a payment scheme which is truthful. The value of the subset - $S$ should be as big as possible with the constraint that the payments - should stay within budget. - \item \emph{value sharing:} the goal is to find a value sharing method to - split the whole value of a subset $S$ among its participants. - \item \emph{k-best auctions:} this one is very similar to \emph{budget - auctions}, but instead of having a budget constraint, the constraint is - on the number of people in the subset $S$. -\end{enumerate} - -These problems have already been studied, and optimal or near-optimal solution -are already known when the value function is submodular. Fortunately, it can be -shown that the information theoretic value defined above is submodular. - -\section{Submodularity} - -In this section, we will consider that we are given a \emph{universe} set $U$. -A set function $f$ is a function defined on the power set of $U$, $\mathfrak{P}(U)$. - -A set function $f$ defined on $\mathfrak{P}(U)$ will be said \emph{increasing} if -it is increasing with regards to inclusion, that is: - -\begin{displaymath} - \forall\,S\subseteq T,\quad f(S)\leq f(T) -\end{displaymath} - -A \emph{decreasing} function on $\mathfrak{P}(U)$ is defined similarly. - -A set function $f$ defined on $\mathfrak{P}(U)$ is said to be -\emph{submodular} if it verifies the diminishing returns property, that is, -the marginal increments when adding a point to a set, is a set decreasing -function. More formally, for any point $x$ in $U$, we can define the marginal -increment of $f$ regarding $x$, it is the set function defined as: -\begin{displaymath} - \Delta_x f(S) = f(S\cup\{x\}) - f(S) -\end{displaymath} -Then, $f$ is \emph{submodular} iff. for all $x$ in $U$, $\Delta_x f$ is a set -decreasing function. - -Similarly, a \emph{supermodular} is a function whose marginal increments are set -increasing functions. -\begin{prop} - Let $R:\mathbf{R}\rightarrow \mathbf{R}$ be a decreasing concave function and - $f:\mathfrak{P}(U)\rightarrow\mathbf{R}$ be a decreasing submodular function, - then the composed function $R\circ f$ is increasing and supermodular. -\end{prop} - -\begin{proof} - The increasingness of $R\circ f$ follows immediately from the decreasingness - of $R$ and $f$. - - For the supermodularity, let $S$ and $T$ be two sets such that $S\subseteq - T$. 
By decreasingness of $f$, we have: - \begin{displaymath} - \forall\,V,\quad f(T)\leq f(S)\quad\mathrm{and}\quad f(T\cup V)\leq f(S\cup V) - \end{displaymath} - - Thus, by concavity of $R$: - \begin{displaymath}\label{eq:base} - \forall\,V,\quad\frac{R\big(f(S)\big)-R\big(f(S\cup V)\big)}{f(S)-f(S\cup V)} - \leq\frac{R\big(f(T)\big)-R\big(f(T\cup V)\big)}{f(T)-f(T\cup V)} - \end{displaymath} - - $f$ is decreasing, so multiplying this last inequality by - $f(S)-f(S\cup V)$ and $f(T)-f(T\cup V)$ yields: - \begin{multline} - \forall V,\quad\Big(R\big(f(S)\big)-R\big(f(S\cup V)\big)\Big)\big(f(T)-f(T\cup V)\big)\\ - \leq \Big(R\big(f(T)\big)-R\big(f(T\cup V)\big)\Big)\big(f(S)-f(S\cup V)\big) - \end{multline} - - $f$ is submodular, so: - \begin{displaymath} - f(T\cup V)-f(T)\leq f(S\cup V) - f(S) - \end{displaymath} - - $R\circ f$ is increasing, so: - \begin{displaymath} - R\big(f(S)\big)-R\big(f(S\cup V)\big)\leq 0 - \end{displaymath} - - By combining the two previous inequalities, we get: - \begin{multline*} - \forall V,\quad\Big(R\big(f(S)\big)-R\big(f(S\cup V)\big)\Big)\big(f(S)-f(S\cup V)\big)\\ - \leq \Big(R\big(f(S)\big)-R\big(f(S\cup V)\big)\Big)\big(f(T)-f(T\cup V)\big) - \end{multline*} - - Injecting this last inequality into \eqref{eq:base} gives: - \begin{multline*} - \forall V,\quad\Big(R\big(f(S)\big)-R\big(f(S\cup V)\big)\Big)\big(f(S)-f(S\cup V)\big)\\ - \leq \Big(R\big(f(T)\big)-R\big(f(T\cup V)\big)\Big)\big(f(S)-f(S\cup V)\big) - \end{multline*} - - Dividing left and right by $f(S)-f(S\cup V)$ yields: - \begin{displaymath} - \forall V,\quad R\big(f(S)\big)-R\big(f(S\cup V)\big) - \leq R\big(f(T)\big)-R\big(f(T\cup V)\big) - \end{displaymath} - which is exactly the supermodularity of $R\circ f$. -\end{proof} - -\end{document} diff --git a/old_notes.tex b/old_notes.tex new file mode 100644 index 0000000..8dc2c7b --- /dev/null +++ b/old_notes.tex @@ -0,0 +1,377 @@ +\documentclass{article} +\usepackage[utf8]{inputenc} +\usepackage{amsmath,amsthm,amsfonts} +\usepackage{comment} +\newtheorem{lemma}{Lemma} +\newtheorem{prop}{Proposition} +\newcommand{\var}{\mathop{\mathrm{Var}}} +\newcommand{\condexp}[2]{\mathop{\mathbb{E}}\left[#1|#2\right]} +\newcommand{\expt}[1]{\mathop{\mathbb{E}}\left[#1\right]} +\newcommand{\norm}[1]{\lVert#1\rVert} +\newcommand{\tr}[1]{#1^*} +\newcommand{\ip}[2]{\langle #1, #2 \rangle} +\newcommand{\mse}{\mathop{\mathrm{MSE}}} +\newcommand{\trace}{\mathop{\mathrm{tr}}} +\begin{document} + + +\section{pomme} + +In this section, we will consider that we are given a \emph{universe} +set $U$ such that all the sets we consider are subsets or $U$ and all +functions defined on sets are defined on the power set of $U$, +$\mathfrak{P}(U)$. + +A function $f$ defined on $\mathfrak{P}(U)$ will be said +\emph{increasing} if it is increasing with regards to inclusion, that +is: +\begin{displaymath} + \forall\,S\subseteq T,\quad f(S)\leq f(T) +\end{displaymath} +A \emph{decreasing} function on $\mathfrak{P}(U)$ is defined similarly. + +\begin{prop} + Let $R:\mathbf{R}\rightarrow \mathbf{R}$ be a decreasing concave + function and $f:\mathfrak{P}(U)\rightarrow\mathbf{R}$ be a + decreasing submodular function, then the composed function $R\circ + f$ is increasing and supermodular. +\end{prop} + +\begin{proof} + The increasingness of $R\circ f$ follows immediately from the + decreasingness of $R$ and $f$. + + For the supermodularity, let $S$ and $T$ be two sets such that + $S\subseteq T$. 
By decreasingness of $f$, we have:
+  \begin{displaymath}
+    \forall\,V,\quad f(T)\leq f(S)\quad\mathrm{and}\quad f(T\cup V)\leq f(S\cup V)
+  \end{displaymath}
+
+  Thus, by concavity of $R$:
+  \begin{displaymath}\label{eq:base}
+    \forall\,V,\quad\frac{R\big(f(S)\big)-R\big(f(S\cup V)\big)}{f(S)-f(S\cup V)}
+    \leq\frac{R\big(f(T)\big)-R\big(f(T\cup V)\big)}{f(T)-f(T\cup V)}
+  \end{displaymath}
+
+  $f$ is decreasing, so multiplying this last inequality by
+  $f(S)-f(S\cup V)$ and $f(T)-f(T\cup V)$ yields:
+  \begin{multline}
+    \forall V,\quad\Big(R\big(f(S)\big)-R\big(f(S\cup V)\big)\Big)\big(f(T)-f(T\cup V)\big)\\
+    \leq \Big(R\big(f(T)\big)-R\big(f(T\cup V)\big)\Big)\big(f(S)-f(S\cup V)\big)
+  \end{multline}
+
+  $f$ is submodular, so:
+  \begin{displaymath}
+    f(T\cup V)-f(T)\leq f(S\cup V) - f(S)
+  \end{displaymath}
+
+  $R\circ f$ is increasing, so:
+  \begin{displaymath}
+    R\big(f(S)\big)-R\big(f(S\cup V)\big)\leq 0
+  \end{displaymath}
+
+  By combining the two previous inequalities, we get:
+  \begin{multline*}
+    \forall V,\quad\Big(R\big(f(S)\big)-R\big(f(S\cup V)\big)\Big)\big(f(S)-f(S\cup V)\big)\\
+    \leq \Big(R\big(f(S)\big)-R\big(f(S\cup V)\big)\Big)\big(f(T)-f(T\cup V)\big)
+  \end{multline*}
+
+  Injecting this last inequality into \eqref{eq:base} gives:
+  \begin{multline*}
+    \forall V,\quad\Big(R\big(f(S)\big)-R\big(f(S\cup V)\big)\Big)\big(f(S)-f(S\cup V)\big)\\
+    \leq \Big(R\big(f(T)\big)-R\big(f(T\cup V)\big)\Big)\big(f(S)-f(S\cup V)\big)
+  \end{multline*}
+
+  Dividing left and right by $f(S)-f(S\cup V)$ yields:
+  \begin{displaymath}
+    \forall V,\quad R\big(f(S)\big)-R\big(f(S\cup V)\big)
+    \leq R\big(f(T)\big)-R\big(f(T\cup V)\big)
+  \end{displaymath}
+  which is exactly the supermodularity of $R\circ f$.
+\end{proof}
+
+
+
+\section{Understanding the recommender system}
+
+\subsection{General problem}
+
+We already have a database $D_n$ of $n$ users. For each user $i$ we
+have a set of $k$ explanatory variables (features), collected in a vector
+$x_i$.
+
+The problem is the following: we are about to start an experiment in which,
+for some of the users in the database, a variable of interest $y_i$ will be
+revealed (for example through a medical survey, or a website buying data
+from another website). From the pairs $(x_i, y_i)$ that we will acquire
+through the experiment, we are going to compute a regression function $f_n$
+which will allow us to predict, for a new $x$, its associated $y$. The
+accuracy of this prediction will be measured by the Mean Squared Error:
+\begin{displaymath}
+  \mse(f_n) = \expt{\big(f_n(x)-y\big)^2}
+\end{displaymath}
+
+We would like to understand the impact of the number of users who take
+part in the experiment; in particular, how much does adding a user to the
+experiment impact the MSE?
+
+\subsection{From the bivariate normal case to linear regression}
+Suppose $(X,Y)$ is drawn from a bivariate normal distribution with mean
+vector $\mu$ and covariance matrix $\Sigma$. Then one can
+write:
+\begin{displaymath}
+  Y = \condexp{Y}{X} + \big(Y-\condexp{Y}{X}\big)
+\end{displaymath}
+In this particular case, $\condexp{Y}{X}$ is a linear function of $X$:
+\begin{displaymath}
+\condexp{Y}{X} = \alpha X + \beta
+\end{displaymath}
+where $\alpha$ and $\beta$ can be expressed as functions of $\mu$ and
+$\Sigma$. Writing $\varepsilon = Y-\condexp{Y}{X}$, it is easy to see
+that $\expt{X\varepsilon}=0$. Furthermore, $\varepsilon$ is also normally
+distributed. Under these assumptions, it can be proven that the least
+squares estimator for $(\alpha,\beta)$ is optimal (it reaches the
+Cramér-Rao bound).
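+
+To make the bivariate normal case concrete, here is a small Python sketch
+(the specific $\mu$ and $\Sigma$ are arbitrary choices of ours): it computes
+$\alpha$ and $\beta$ from $\mu$ and $\Sigma$ and checks empirically that the
+residual $\varepsilon$ is uncorrelated with $X$.
+\begin{verbatim}
+import numpy as np
+
+rng = np.random.default_rng(0)
+mu = np.array([1.0, 2.0])              # mean vector of (X, Y)
+Sigma = np.array([[2.0, 0.8],
+                  [0.8, 1.0]])         # covariance matrix of (X, Y)
+
+# E[Y|X] = alpha*X + beta with alpha = Cov(X,Y)/Var(X):
+alpha = Sigma[0, 1] / Sigma[0, 0]
+beta = mu[1] - alpha * mu[0]
+
+sample = rng.multivariate_normal(mu, Sigma, size=200_000)
+X, Y = sample[:, 0], sample[:, 1]
+eps = Y - (alpha * X + beta)           # residual
+print(np.mean(X * eps))                # close to 0: E[X * eps] = 0
+\end{verbatim}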
+
+\subsection{Linear regression}
+
+We assume a linear model:
+\begin{displaymath}
+  y_i = \beta\cdot x_i + \varepsilon_i
+\end{displaymath}
+where $\varepsilon_i$ is a normal random variable uncorrelated with
+$x_i$. We also assume that $\var(\varepsilon_i)$ is constant
+(homoscedasticity); $\sigma^2$ will denote the common value.
+
+From the database we compute the least-squares estimator of $\beta$:
+\begin{displaymath}
+  \hat{\beta}_n = (\tr XX)^{-1}\tr XY
+\end{displaymath}
+where $X$ is a $n\times k$ matrix ($k$ is the number of explanatory
+variables) whose $i$-th row is $\tr x_i$ and $Y$ is $(y_1,\ldots,y_n)$.
+
+The regression function is simply the inner product of $\hat{\beta}_n$
+and the new vector of explanatory variables $x$.
+
+From there we can compute the MSE:
+\begin{displaymath}
+  \mse(D_n)
+  =\expt{\left(\beta\cdot x + \varepsilon - \hat\beta_n\cdot x\right)^2}
+  = \tr x\expt{(\hat\beta_n-\beta)\cdot (\hat\beta_n-\beta)} x + \expt{\varepsilon^2}
+\end{displaymath}
+where we used that
+$\expt{x\varepsilon}=0$. The variance-covariance matrix of
+$\hat\beta_n$ is equal to $\sigma^2(\tr XX)^{-1}$. Finally we get:
+\begin{displaymath}
+  \mse(D_n)
+  = \sigma^2\tr x(\tr XX)^{-1}x + \sigma^2
+\end{displaymath}
+
+\subsubsection*{Monotonicity}
+
+We first want to study the impact of adding one observation to the
+database. First, notice that:
+\begin{displaymath}
+  \tr XX = \sum_{i=1}^n x_i \tr x_i
+\end{displaymath}
+Let us write $A = \tr X X$. Then, adding $x_0$ to the database will change
+$A$ to $A+x_0\tr x_0$.
+
+The following derivation will make extensive use of the
+Sherman-Morrison formula \cite{sm} (which can be proven by direct
+verification). For any invertible matrix $A$:
+\begin{equation}\label{eq:inverse}
+(A+x\tr y)^{-1} = A^{-1} - \frac{A^{-1}x\tr yA^{-1}}{1+\tr x A^{-1}y}
+\end{equation}
+
+$A^{-1}$ is the inverse of a positive definite matrix ($A$ is assumed
+invertible), hence it is also positive definite. We will denote by
+$\ip{x}{y}$ the scalar product defined by $A^{-1}$, that is:
+\begin{displaymath}
+  \ip{x}{y} = \tr x A^{-1} y = \tr y A^{-1} x
+\end{displaymath}
+
+Using \eqref{eq:inverse} we get:
+\begin{displaymath}
+\begin{split}
+  \tr x (A + x_0\tr x_0)^{-1} x & = \tr x A^{-1} x - \frac{\tr x
+  A^{-1}x_0\tr x_0A^{-1} x}{1+\tr x_0 A^{-1}x_0 }\\
+& = \tr x A^{-1} x - \frac{\ip{x}{x_0}^2}{1+\norm{x_0}^2}
+\end{split}
+\end{displaymath}
+
+Thus:
+\begin{equation}\label{eq:decrease}
+\mse(D_n\cup\{x_0\}) = \mse(D_n) - \frac{\sigma^2\ip{x}{x_0}^2}{1+\norm{x_0}^2}
+\end{equation}
+
+\emph{Adding one observation to the database decreases the MSE.}
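+
+The update formula \eqref{eq:decrease} is easy to check numerically. The
+following Python sketch (on an arbitrary random database of our choosing)
+compares the closed-form update against a direct recomputation of the MSE,
+and confirms the monotonicity claim:
+\begin{verbatim}
+import numpy as np
+
+rng = np.random.default_rng(0)
+k, sigma2 = 3, 1.0
+X = rng.normal(size=(10, k))         # current database D_n
+A = X.T @ X                          # A = X^T X
+x = rng.normal(size=k)               # new point to predict
+x0 = rng.normal(size=k)              # observation to be added
+
+def mse(A):
+    # MSE(D) = sigma^2 x^T A^{-1} x + sigma^2
+    return sigma2 * x @ np.linalg.solve(A, x) + sigma2
+
+ip = x @ np.linalg.solve(A, x0)      # <x, x0> in the A^{-1} geometry
+n0 = x0 @ np.linalg.solve(A, x0)     # ||x0||^2 in the A^{-1} geometry
+updated = mse(A) - sigma2 * ip**2 / (1 + n0)  # eq. (eq:decrease)
+direct = mse(A + np.outer(x0, x0))            # recompute from scratch
+assert np.isclose(updated, direct)   # Sherman-Morrison update is exact
+assert direct <= mse(A)              # adding an observation lowers the MSE
+\end{verbatim}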
+
+\subsubsection*{Submodularity}
+
+Let $D_m$ be a database of size $m$ containing $D_n$: $D_m$ is obtained
+from $D_n$ by adding some observations. We would like to show that
+adding one observation to $D_m$ yields a smaller decrease in MSE than
+adding the same observation to $D_n$:
+\begin{displaymath}
+  \mse(D_m)-\mse(D_m\cup\{x_0\})\leq \mse(D_n)-\mse(D_n\cup\{x_0\})
+\end{displaymath}
+
+First, remark that it is necessary and sufficient to prove this property
+when $D_n$ and $D_m$ differ by only one observation. Indeed, if the
+property is true in general, then it will also be true when the two
+databases differ by only one observation. Conversely, if the property is
+true when the two databases differ by only one observation, then applying
+the property repeatedly yields the general property.
+
+Using \eqref{eq:decrease}, the decrease of MSE when adding $x_0$ to
+the database is:
+\begin{displaymath}
+  \frac{\sigma^2(\tr x A^{-1} x_0)^2}{1+\tr x_0 A^{-1} x_0}
+\end{displaymath}
+
+If we denote by $z$ the additional observation present in $D_m$ and
+not in $D_n$, then we would like to prove that:
+\begin{displaymath}
+  \frac{\sigma^2(\tr x A^{-1} x_0)^2}{1+\tr x_0 A^{-1} x_0}
+\geq \frac{\sigma^2\left(\tr x (A+z\tr z)^{-1} x_0\right)^2}{1+\tr x_0 (A+z\tr z)^{-1} x_0}
+\end{displaymath}
+
+Using the same notation as before, this is equivalent
+to:
+\begin{displaymath}
+  \frac{\ip{x}{x_0}^2}{1+\norm{x_0}^2}
+\geq \frac{\left(\left(1+\norm{z}^2\right)\ip{x}{x_0}-\ip{x}{z}\ip{z}{x_0}\right)^2}
+{\left(1+\norm{z}^2\right)\big((1+\norm{z}^2)(1+\norm{x_0}^2)-\ip{x_0}{z}^2\big)}
+\end{displaymath}
+
+By the Cauchy-Schwarz inequality:
+\begin{displaymath}
+  (1+\norm{z}^2)(1+\norm{x_0}^2)-\ip{x_0}{z}^2 > 0
+\end{displaymath}
+
+Thus the previous inequality is equivalent to:
+\begin{multline*}
+  \left(1+\norm{z}^2\right)^2\left(1+\norm{x_0}^2\right)\ip{x}{x_0}^2
+-\left(1+\norm{z}^2\right)\ip{x_0}{z}^2\ip{x}{x_0}^2\\
+\geq \left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)^2\ip{x}{x_0}^2
++ \left(1+\norm{x_0}^2\right)\ip{x}{z}^2\ip{z}{x_0}^2\\
+-2\left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)\ip{x}{x_0}\ip{x}{z}\ip{z}{x_0}
+\end{multline*}
+After cancelling the common term and rearranging, this reduces to:
+\begin{multline*}
+2\left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)\ip{x}{x_0}\ip{x}{z}\ip{z}{x_0}\\
+\geq \left(1+\norm{x_0}^2\right)\ip{x}{z}^2\ip{z}{x_0}^2
++ \left(1+\norm{z}^2\right)\ip{x_0}{z}^2\ip{x}{x_0}^2
+\end{multline*}
+
+This last inequality is not true in general. As soon as $x$, $x_0$ and
+$z$ span a 2-dimensional space, it is possible that, for example,
+$\ip{x}{x_0}$ and $\ip{x}{z}$ are positive while $\ip{z}{x_0}$ is
+negative. The left-hand side is then negative and cannot be greater than
+the right-hand side, which is always nonnegative.
+
+In the one-dimensional case, the inner product $\ip{x}{z}$ can be
+written as $\lambda xz$ for some positive $\lambda$. Then the last
+inequality becomes:
+\begin{displaymath}
+  2\geq \frac{\lambda z^2}{1+\lambda z^2}
++ \frac{\lambda x_0^2}{1+\lambda x_0^2}
+\end{displaymath}
+which is trivially true (a more direct proof for the one-dimensional
+case is of course possible).
+
+In order to understand more precisely under which assumptions the
+above inequality could become true, it is convenient to look at it
+from the quadratic form perspective. Indeed this inequality can be
+rewritten as:
+
+\begin{equation}\label{eq-inequality}
+\tr x B x \geq 0
+\end{equation}
+
+with:
+\begin{align*}
+  B = &\, \left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)\ip{x_0}{z}
+  (x_0\tr z+z\tr x_0)\\
+& -\ip{x_0}{z}^2\Big( \left(1+\norm{x_0}^2\right)z\tr z + \left(1+\norm{z}^2\right)x_0\tr x_0\Big)
+\end{align*}
+
+This quadratic form is degenerate: its kernel is $x_0^{\bot}\cap
+z^\bot$, which has dimension $k-2$.
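+
+The failure of submodularity is easy to exhibit numerically. The sketch
+below is our own construction: we take $A$ to be the identity (a database
+whose observations are the standard basis vectors) and search for a triple
+$x$, $x_0$, $z$ for which the marginal decrease in MSE grows when $z$ is
+added first, directly violating diminishing returns:
+\begin{verbatim}
+import numpy as np
+
+rng = np.random.default_rng(0)
+sigma2, k = 1.0, 2
+X = np.eye(k)                # database D_n with A = X^T X = I
+A = X.T @ X
+
+def mse_decrease(A, x, x0):
+    # MSE(D) - MSE(D u {x0}) = sigma^2 <x,x0>^2 / (1 + ||x0||^2)
+    ip = x @ np.linalg.solve(A, x0)
+    return sigma2 * ip**2 / (1 + x0 @ np.linalg.solve(A, x0))
+
+for _ in range(1000):
+    x, x0, z = rng.normal(size=(3, k))
+    d_small = mse_decrease(A, x, x0)                   # add x0 to D_n
+    d_large = mse_decrease(A + np.outer(z, z), x, x0)  # add x0 to D_n u {z}
+    if d_large > d_small + 1e-12:     # submodularity would forbid this
+        print("violation found:", x, x0, z)
+        break
+\end{verbatim}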
+
+\paragraph{Case when $\norm{x_0}=\norm{z}=1$} In this case, it suffices to
+study the quadratic form given by the matrix $B'$ with:
+\begin{displaymath}
+  B' = 2\ip{x_0}{z}(x_0\tr z+z\tr x_0) -\ip{x_0}{z}^2(z\tr z + x_0\tr x_0)
+\end{displaymath}
+
+Writing $a = \ip{x_0}{z}$, the two non-zero eigenvalues are:
+\begin{align*}
+  \lambda_1 & = -a^3 + a^2 + 2a = -a(a+1)(a-2)\\
+  \lambda_2 & = a^3 + a^2 - 2a = a(a-1)(a+2)
+\end{align*}
+
+which are respectively associated with the eigenvectors:
+\begin{align*}
+  x_1 & = x_0+z\\
+  x_2 & = x_0 - z
+\end{align*}
+
+By the Cauchy-Schwarz inequality, $a\in]-1,1[$, and the two
+eigenvalues are of opposite signs on this interval (for $a\neq 0$). Thus
+inequality \eqref{eq-inequality} does not hold for all $x$.
+
+\paragraph{In expectation?} If we assume prior knowledge of the
+distribution of $x$, writing $\Sigma$ for the variance-covariance matrix
+of $x$ and $\mu$ for its mean vector, then taking the expectation of
+\eqref{eq-inequality} we get:
+\begin{displaymath}
+  \expt{\tr x B' x} = \trace(B'\Sigma) + \tr\mu B'\mu
+\end{displaymath}
+
+\nocite{shapley,inverse,recommendation,cook,shapleyor,subsetselection11,lse}
+\bibliographystyle{plain}
+\bibliography{notes.bib}
+
+\section*{Appendix}
+
+\paragraph{Previous attempt at taming the submodularity}
+
+The inequality only depends on the projection of $x$ on the plane
+spanned by $x_0$ and $z$. Writing
+\begin{displaymath}
+  x = \lambda x_0 + \mu z + v,\quad\mathrm{with}\quad v \,\bot\,
+  \mathrm{span}\{x_0, z\}
+\end{displaymath}
+the previous inequality becomes:
+\begin{multline*}
+2\left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)\ip{x_0}{z}
+\left(\lambda\norm{x_0}^2+\mu\ip{x_0}{z}\right)
+\left(\lambda\ip{x_0}{z}+\mu\norm{z}^2\right)\\
+- \left(1+\norm{x_0}^2\right)\ip{x_0}{z}^2\left(\lambda\ip{x_0}{z}+\mu\norm{z}^2\right)^2\\
+- \left(1+\norm{z}^2\right)\ip{x_0}{z}^2\left(\lambda\norm{x_0}^2+\mu\ip{x_0}{z}\right)^2\geq 0
+\end{multline*}
+
+By expanding and reordering the terms, we obtain a quadratic function
+of $\lambda$ and $\mu$:
+\begin{multline*}
+  \lambda^2\ip{x_0}{z}^2\left[2\norm{x_0}^2+\norm{x_0}^4+\norm{x_0}^2\norm{z}^2
+  +(1+\norm{x_0})^2\big(\norm{x_0}^2\norm{z}^2-\ip{x_0}{z}^2\big)\right]\\
++\mu^2\ip{x_0}{z}^2\left[2\norm{z}^2+\norm{z}^4+\norm{x_0}^2\norm{z}^2
+  +(1+\norm{z})^2\big(\norm{x_0}^2\norm{z}^2-\ip{x_0}{z}^2\big)\right]\\
++2\lambda\mu\ip{x_0}{z}\Big[\norm{x_0}^2\norm{z}^2
+\big(\norm{x_0}^2\norm{z}^2-\ip{x_0}{z}^2\big)\\
++\ip{x_0}{z}^2+\norm{x_0}^2\norm{z}^2
+\big(1+\norm{x_0}^2+\norm{z}^2\big)\Big]\geq 0
+\end{multline*}
+
+This inequality will be true for all $\lambda$ and $\mu$ if and only
+if the quadratic form is positive semidefinite. As its trace is
+positive, this is equivalent to the nonnegativity of its determinant.
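+
+The eigenvalue computation above is easy to verify numerically. The sketch
+below is our own check; it takes $A$ to be the identity, so that
+$\ip{\cdot}{\cdot}$ is the Euclidean inner product, and confirms that
+$x_0\pm z$ are eigenvectors of $B'$ with the stated eigenvalues:
+\begin{verbatim}
+import numpy as np
+
+rng = np.random.default_rng(1)
+x0, z = rng.normal(size=(2, 4))
+x0, z = x0 / np.linalg.norm(x0), z / np.linalg.norm(z)  # unit norms
+a = x0 @ z                                              # a = <x0, z>
+B = (2 * a * (np.outer(x0, z) + np.outer(z, x0))
+     - a**2 * (np.outer(z, z) + np.outer(x0, x0)))
+for v, lam in ((x0 + z, -a * (a + 1) * (a - 2)),
+               (x0 - z,  a * (a - 1) * (a + 2))):
+    assert np.allclose(B @ v, lam * v)   # eigenpair checks out
+\end{verbatim}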
+ + +\end{document} \ No newline at end of file diff --git a/schema.pdf b/schema.pdf deleted file mode 100644 index c44eb38..0000000 Binary files a/schema.pdf and /dev/null differ diff --git a/slides.tex b/slides.tex deleted file mode 100644 index ec2e5be..0000000 --- a/slides.tex +++ /dev/null @@ -1,221 +0,0 @@ -\documentclass{beamer} -\usepackage[utf8x]{inputenc} -\usepackage[greek,english]{babel} -\usepackage[LGR,T1]{fontenc} -\usepackage{lmodern} -\usepackage{amsmath} -\usepackage{graphicx} -\usepackage{tikz} -\newcommand{\tr}[1]{#1^*} -\newcommand{\expt}[1]{\mathop{\mathbb{E}}\left[#1\right]} -\newcommand{\mse}{\mathop{\mathrm{MSE}}} -\usetheme{Boadilla} -\usecolortheme{beaver} -\title[Data monetization]{Data monetization: pricing user data\\with the Shapley Value} -\author{Thibaut \textsc{Horel}} -\institute[Technicolor]{Work with {\greektext Στρατής \textsc{Ιωαννίδης}}} -\setbeamercovered{transparent} - -\AtBeginSection[] -{ -\begin{frame} -\frametitle{Outline} -\tableofcontents[currentsection] -\end{frame} -} - -\begin{document} - -\begin{frame} -\maketitle -\end{frame} - -\section{Problem} -\begin{frame}{Problem Overview} -\begin{center} -\includegraphics[width=0.8\textwidth]{schema.pdf} -\end{center} -\end{frame} - -\begin{frame}{Motivation} - -Why is it useful? - -\begin{itemize} -\item sell/buy user data -\item compensate users in your database -\end{itemize} - -\end{frame} - -\begin{frame}{Challenges} - -We need to understand: -\begin{itemize} -\item the recommender system -\item the revenue structure -\item the relation between the revenue and the value of the database -\item how to compensate individual users -\end{itemize} -\end{frame} - -\section{Shapley value} - -\begin{frame}{Individual value} - \begin{itemize} - \item $S$: set of players - \item $S'\subset S$: coalition - \item $V: \mathcal{P}(S)\to \mathbf{R}$: \alert{value} function: - \begin{itemize} - \item $V(\emptyset) = 0$ - \item $V(S')$: value of the coalition $S'$ - \end{itemize} - \end{itemize} - - \alert{Question:} How to split the value of a subset across all its constituents? 
- - \alert{Goal:} $\phi_i(S',V)$: value of player $i$ in $S'$ -\end{frame} - -\begin{frame}{Solution attempts} - \begin{itemize} - \visible<1->{\item $\phi_i(S',V) = V(\{i\})$?} \visible<2->{``The whole is greater than the sum of its parts'' - $$V(S')\neq \sum_{i\in S'} V(\{i\})$$} - \visible<3->{\item $\phi_i(S',V) = V(S') - V(S'\setminus\{i\})$: the \alert{marginal contribution}?} \visible<4->{depends on the time the player joins the coalition} - \end{itemize} -\end{frame} - -\begin{frame}{Axioms} - \begin{description} - \item[Efficiency:] No money lost in the splitting process - $$V(S) = \sum_{i\in S} \phi_i(S,V)$$ -\pause - \item[Symmetry:] Equal pay for equal contribution - - \alert{if} $\forall S'\subset S\setminus\{i,j\}, V(S'\cup\{i\}) = V(S'\cup\{j\})$ - - \alert{then} $\phi_i(S,V) = \phi_j(S,V)$ -\pause - \item[Fairness:] $j$'s contribution to $i$ equals $i$'s contribution to $j$ - $$\phi_i(S,V) - \phi_i(S\setminus\{j\}) = \phi_j(S,V)- \phi_j(S\setminus\{i\})$$ - \end{description} -\end{frame} - -\begin{frame}{Solution: the Shapley Value} - - \begin{theorem}[Shapley, 1953] - There is only one function satisfying Efficiency, Symmetry and - Fairness: - $$\phi_i(S,V) = \frac{1}{\vert S\vert!}\sum_{\pi\in\mathfrak{S}(S)}\Delta_i\big(V,S(i,\pi)\big)$$ - \end{theorem} - \begin{itemize} - \item $\mathfrak{S}(S)$: set of permutations of $S$ - \item $S(i,\pi)$: set of players before $i$ in $\pi$ - \item $\Delta_i\big(V,T\big) = - V\big(T\big)-V\big(T\setminus\{i\})$: marginal contribution of $i$ - to $T$ - \end{itemize} -\end{frame} - -\begin{frame}{Properties} - \begin{itemize} - \item - \alert{if} $V$ is superadditive: $V(S\cup T) \geq V(S) + V(T)$ -\pause - - \alert{then} the Shapley Value is individually rational: - $\phi_i(S,V) > V(\{i\})$ -\pause - \item - \alert{if} $V$ is supermodular: $\Delta_i(V,S)\leq\Delta_i(V,T), - \forall S\subset T$ -\pause - - \alert{then} the Shapley Value lies in the core of - the game: - $$\sum_{i\in S'}\phi_i(S,V) \geq V(S'), \forall S'\subset S$$ -\pause - \item Hervé Moulin \& Scott Shenker (1999): Group-strategyproof auction - \end{itemize} -\end{frame} - -\section{Example scenario} - -\begin{frame}{Setup} - \begin{itemize} - \item database $X$: $n\times p$ matrix - - \item $x_i\in \mathbf{R}^p$ $i$-th row of $X$: data of user $i$ - - \item for each user: $y_i$ variable of interest, could be: - \begin{itemize} - \item already observed - \item about to be observed - \end{itemize} - \end{itemize} -\end{frame} - -\begin{frame}{Recommender system} -\alert{Goal:} predict the variable of interest $y$ of a new user $x$ - -\alert{Linear model:} relation between data and variable of interest: -$$y = \langle\beta,x\rangle + \varepsilon, \quad \varepsilon\simeq\mathcal{N}(0,\sigma^2)$$ - -\alert{Linear regression:} estimator $\tilde{\beta} = -(\tr{X}X)^{-1}\tr{X}Y$ - -\alert{Prediction:} $\tilde{y} = \langle\tilde{\beta},x\rangle$ -\end{frame} - -\begin{frame}{Revenue structure} -\alert{Prediction error:} $\mse(X,x) = \expt{(\tilde{y} - y)^2} = -\sigma^2\tr{x}(\tr{X}X)^{-1}x + \sigma^2$ -\vspace{1cm} -\begin{block}{Property} - \emph{The prediction error decreases as the size of the database increases} -\end{block} -\vspace{1cm} -\alert{$\Rightarrow$} choose a decreasing function of the prediction -error -\pause - -\alert{Problem:} $\mse$ not submodular -\end{frame} - -\begin{frame}{Revenue structure (2)} -Exponentially decreasing revenue: -$$V(X,x) = \exp\big(-\mse(X,x)\big)$$ - -Value of database $X$: -$$V(X) = \int_{x\in\mathbb{R}^d} V(X,x) dx$$ -\end{frame} 
- -\begin{frame}{Properties} -\begin{block}{Closed-form expression} -$$V(X) = C\sqrt{\det(\tr{X}X)}$$ -\end{block} - -\vspace{1cm} -\pause -\begin{block}{Supermodularity} -$$X\mapsto\log\big(V(X)\big)$$ is supermodular -\end{block} -\pause -\alert{$\Rightarrow$} we can apply the Shapley value theory! -\end{frame} - -\begin{frame}{Conclusion} - -\begin{itemize} -\item general framework to study the pricing problem -\item the case of the linear regression -\end{itemize} -\vspace{1cm} -\pause -Future directions -\begin{itemize} -\item different revenue models -\item fluid Shapley Value (Aumann-Shapley) for a continuum of players -\end{itemize} -\end{frame} -\end{document} \ No newline at end of file diff --git a/slides/schema.pdf b/slides/schema.pdf new file mode 100644 index 0000000..c44eb38 Binary files /dev/null and b/slides/schema.pdf differ diff --git a/slides/slides.tex b/slides/slides.tex new file mode 100644 index 0000000..ec2e5be --- /dev/null +++ b/slides/slides.tex @@ -0,0 +1,221 @@ +\documentclass{beamer} +\usepackage[utf8x]{inputenc} +\usepackage[greek,english]{babel} +\usepackage[LGR,T1]{fontenc} +\usepackage{lmodern} +\usepackage{amsmath} +\usepackage{graphicx} +\usepackage{tikz} +\newcommand{\tr}[1]{#1^*} +\newcommand{\expt}[1]{\mathop{\mathbb{E}}\left[#1\right]} +\newcommand{\mse}{\mathop{\mathrm{MSE}}} +\usetheme{Boadilla} +\usecolortheme{beaver} +\title[Data monetization]{Data monetization: pricing user data\\with the Shapley Value} +\author{Thibaut \textsc{Horel}} +\institute[Technicolor]{Work with {\greektext Στρατής \textsc{Ιωαννίδης}}} +\setbeamercovered{transparent} + +\AtBeginSection[] +{ +\begin{frame} +\frametitle{Outline} +\tableofcontents[currentsection] +\end{frame} +} + +\begin{document} + +\begin{frame} +\maketitle +\end{frame} + +\section{Problem} +\begin{frame}{Problem Overview} +\begin{center} +\includegraphics[width=0.8\textwidth]{schema.pdf} +\end{center} +\end{frame} + +\begin{frame}{Motivation} + +Why is it useful? + +\begin{itemize} +\item sell/buy user data +\item compensate users in your database +\end{itemize} + +\end{frame} + +\begin{frame}{Challenges} + +We need to understand: +\begin{itemize} +\item the recommender system +\item the revenue structure +\item the relation between the revenue and the value of the database +\item how to compensate individual users +\end{itemize} +\end{frame} + +\section{Shapley value} + +\begin{frame}{Individual value} + \begin{itemize} + \item $S$: set of players + \item $S'\subset S$: coalition + \item $V: \mathcal{P}(S)\to \mathbf{R}$: \alert{value} function: + \begin{itemize} + \item $V(\emptyset) = 0$ + \item $V(S')$: value of the coalition $S'$ + \end{itemize} + \end{itemize} + + \alert{Question:} How to split the value of a subset across all its constituents? 
+
+  \alert{Goal:} $\phi_i(S',V)$: value of player $i$ in $S'$
+\end{frame}
+
+\begin{frame}{Solution attempts}
+  \begin{itemize}
+    \visible<1->{\item $\phi_i(S',V) = V(\{i\})$?} \visible<2->{``The whole is greater than the sum of its parts''
+    $$V(S')\neq \sum_{i\in S'} V(\{i\})$$}
+    \visible<3->{\item $\phi_i(S',V) = V(S') - V(S'\setminus\{i\})$: the \alert{marginal contribution}?} \visible<4->{depends on when the player joins the coalition}
+  \end{itemize}
+\end{frame}
+
+\begin{frame}{Axioms}
+  \begin{description}
+    \item[Efficiency:] No money lost in the splitting process
+      $$V(S) = \sum_{i\in S} \phi_i(S,V)$$
+\pause
+    \item[Symmetry:] Equal pay for equal contribution
+
+      \alert{if} $\forall S'\subset S\setminus\{i,j\}, V(S'\cup\{i\}) = V(S'\cup\{j\})$
+
+      \alert{then} $\phi_i(S,V) = \phi_j(S,V)$
+\pause
+    \item[Fairness:] $j$'s contribution to $i$ equals $i$'s contribution to $j$
+      $$\phi_i(S,V) - \phi_i(S\setminus\{j\},V) = \phi_j(S,V)- \phi_j(S\setminus\{i\},V)$$
+  \end{description}
+\end{frame}
+
+\begin{frame}{Solution: the Shapley Value}
+
+  \begin{theorem}[Shapley, 1953]
+    There is only one function satisfying Efficiency, Symmetry and
+    Fairness:
+    $$\phi_i(S,V) = \frac{1}{\vert S\vert!}\sum_{\pi\in\mathfrak{S}(S)}\Delta_i\big(V,S(i,\pi)\big)$$
+  \end{theorem}
+  \begin{itemize}
+    \item $\mathfrak{S}(S)$: set of permutations of $S$
+    \item $S(i,\pi)$: set of players before $i$ in $\pi$, together with $i$
+    \item $\Delta_i\big(V,T\big) =
+      V\big(T\big)-V\big(T\setminus\{i\}\big)$: marginal contribution of $i$
+      to $T$
+  \end{itemize}
+\end{frame}
+
+\begin{frame}{Properties}
+  \begin{itemize}
+    \item
+      \alert{if} $V$ is superadditive: $V(S\cup T) \geq V(S) + V(T)$
+\pause
+
+      \alert{then} the Shapley Value is individually rational:
+      $\phi_i(S,V) \geq V(\{i\})$
+\pause
+    \item
+      \alert{if} $V$ is supermodular: $\Delta_i(V,S)\leq\Delta_i(V,T),
+      \forall S\subset T$
+\pause
+
+      \alert{then} the Shapley Value lies in the core of
+      the game:
+      $$\sum_{i\in S'}\phi_i(S,V) \geq V(S'), \forall S'\subset S$$
+\pause
+    \item Hervé Moulin \& Scott Shenker (1999): Group-strategyproof auction
+  \end{itemize}
+\end{frame}
+
+\section{Example scenario}
+
+\begin{frame}{Setup}
+  \begin{itemize}
+    \item database $X$: $n\times p$ matrix
+
+    \item $x_i\in \mathbf{R}^p$: $i$-th row of $X$, the data of user $i$
+
+    \item for each user: $y_i$ variable of interest, could be:
+      \begin{itemize}
+        \item already observed
+        \item about to be observed
+      \end{itemize}
+  \end{itemize}
+\end{frame}
+
+\begin{frame}{Recommender system}
+\alert{Goal:} predict the variable of interest $y$ of a new user $x$
+
+\alert{Linear model:} relation between data and variable of interest:
+$$y = \langle\beta,x\rangle + \varepsilon, \quad \varepsilon\sim\mathcal{N}(0,\sigma^2)$$
+
+\alert{Linear regression:} estimator $\tilde{\beta} =
+(\tr{X}X)^{-1}\tr{X}Y$
+
+\alert{Prediction:} $\tilde{y} = \langle\tilde{\beta},x\rangle$
+\end{frame}
+
+\begin{frame}{Revenue structure}
+\alert{Prediction error:} $\mse(X,x) = \expt{(\tilde{y} - y)^2} =
+\sigma^2\tr{x}(\tr{X}X)^{-1}x + \sigma^2$
+\vspace{1cm}
+\begin{block}{Property}
+  \emph{The prediction error decreases as the size of the database increases}
+\end{block}
+\vspace{1cm}
+\alert{$\Rightarrow$} choose a decreasing function of the prediction
+error
+\pause
+
+\alert{Problem:} $\mse$ is not submodular
+\end{frame}
+
+\begin{frame}{Revenue structure (2)}
+Exponentially decreasing revenue:
+$$V(X,x) = \exp\big(-\mse(X,x)\big)$$
+
+Value of database $X$:
+$$V(X) = \int_{x\in\mathbb{R}^p} V(X,x)\, dx$$
+\end{frame}
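+
+% Added sketch (not in the original deck): the Shapley value of a toy
+% database, computed by brute-force enumeration of permutations, using
+% V(X) proportional to sqrt(det(X^T X)) as on the next slide; the toy
+% matrix and C = 1 are arbitrary choices for illustration.
+\begin{frame}[fragile]{Sanity check: Shapley value by enumeration}
+\begin{verbatim}
+from itertools import permutations
+from math import factorial
+import numpy as np
+
+def shapley(players, V):
+    # phi_i = average over all orders of i's marginal contribution
+    phi = dict.fromkeys(players, 0.0)
+    for pi in permutations(players):
+        seen = set()
+        for i in pi:
+            phi[i] += (V(seen | {i}) - V(seen)) / factorial(len(players))
+            seen.add(i)
+    return phi
+
+X = np.array([[1.0, 0.2], [0.3, 1.0], [0.5, 0.5]])  # 3 users, p = 2
+V = lambda S: np.sqrt(max(np.linalg.det(X[sorted(S)].T @ X[sorted(S)]), 0))
+phi = shapley(range(3), V)
+assert np.isclose(sum(phi.values()), V({0, 1, 2}))  # efficiency axiom
+\end{verbatim}
+\end{frame}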
+
+\begin{frame}{Properties}
+\begin{block}{Closed-form expression}
+$$V(X) = C\sqrt{\det(\tr{X}X)}$$
+\end{block}
+
+\vspace{1cm}
+\pause
+\begin{block}{Supermodularity}
+$$X\mapsto\log\big(V(X)\big)$$ is supermodular
+\end{block}
+\pause
+\alert{$\Rightarrow$} we can apply the Shapley value theory!
+\end{frame}
+
+\begin{frame}{Conclusion}
+
+\begin{itemize}
+\item a general framework to study the pricing problem
+\item the case of linear regression
+\end{itemize}
+\vspace{1cm}
+\pause
+Future directions:
+\begin{itemize}
+\item different revenue models
+\item fluid Shapley Value (Aumann-Shapley) for a continuum of players
+\end{itemize}
+\end{frame}
+\end{document}
\ No newline at end of file
-- 
cgit v1.2.3-70-g09d2