-rw-r--r--  notes.tex                                    | 387
-rw-r--r--  notes2.tex                                   | 162
-rw-r--r--  old_notes.tex                                | 377
-rw-r--r--  slides/schema.pdf (renamed from schema.pdf)  | bin 844491 -> 844491 bytes
-rw-r--r--  slides/slides.tex (renamed from slides.tex)  |   0
5 files changed, 463 insertions(+), 463 deletions(-)
diff --git a/notes.tex b/notes.tex
index 8dc2c7b..92eabb5 100644
--- a/notes.tex
+++ b/notes.tex
@@ -3,6 +3,8 @@
\usepackage{amsmath,amsthm,amsfonts}
\usepackage{comment}
\newtheorem{lemma}{Lemma}
+\newtheorem{fact}{Fact}
+\newtheorem{example}{Example}
\newtheorem{prop}{Proposition}
\newcommand{\var}{\mathop{\mathrm{Var}}}
\newcommand{\condexp}[2]{\mathop{\mathbb{E}}\left[#1|#2\right]}
@@ -14,35 +16,102 @@
\newcommand{\trace}{\mathop{\mathrm{tr}}}
\begin{document}
+\section{Problem}
-\section{pomme}
+\begin{itemize}
+\item database $D = (X_i)_{1\leq i\leq n}$
+\item $(X_i)_{1\leq i\leq n}\in\big(\mathbf{R}^d\big)^n$ sampled in an i.i.d.
+  fashion from the probability distribution $\mu$
+\end{itemize}
-In this section, we will consider that we are given a \emph{universe}
-set $U$ such that all the sets we consider are subsets or $U$ and all
-functions defined on sets are defined on the power set of $U$,
-$\mathfrak{P}(U)$.
+We assume that there is a common random variable $P$ representing the underlying
+phenomenon that we are interested in, and that all the variables in the database
+are noisy realisations of this phenomenon:
+\begin{displaymath}
+ X_i = P + N_i
+\end{displaymath}
+where the $N_i$ are pairwise independent and also independent of $P$.
+
+
+We are interested in defining the \emph{value} of a subset $S$ of the
+database $D$. The approaches to such a definition can be classified into two
+main families (a worked Gaussian example follows the list):
+\begin{itemize}
+ \item \emph{Information theoretic:} We use the entropy of $P$ as a measure
+ of the uncertainty. Then the value of a subset $S$ is the decrease of
+ entropy induced by revealing it:
+ \begin{displaymath}
+ V(S) = H(P) - H(P\,|\,S)
+ \end{displaymath}
+    \item \emph{Statistical:} in this case you are trying to estimate the
+        phenomenon. The variance of the estimator is used as a measure of
+        the uncertainty. The value of a set $S$ can then be defined as the
+        average reduction of variance over all the points in the database
+        once you observe $S$:
+        \begin{displaymath}
+            V(S) = \frac{1}{|D|}\sum_{x\in D} \big(\sigma_x^2 - \sigma_{x|S}^2\big)
+        \end{displaymath}
+\end{itemize}
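+
+The following worked example is an addition to these notes: a minimal sketch
+assuming the one-dimensional Gaussian case, with $\sigma_P^2$ and $\sigma_N^2$
+denoting the (assumed) variances of $P$ and of the noise.
+\begin{example}
+  Assume $d=1$, $P\sim\mathcal{N}(0,\sigma_P^2)$ and
+  $N_i\sim\mathcal{N}(0,\sigma_N^2)$ i.i.d. For a subset $S$ with $|S|=k$, the
+  posterior variance of $P$ given $S$ is
+  \begin{displaymath}
+    \sigma_{P|S}^2=\Big(\frac{1}{\sigma_P^2}+\frac{k}{\sigma_N^2}\Big)^{-1},
+  \end{displaymath}
+  so the information theoretic value is
+  \begin{displaymath}
+    V(S)=H(P)-H(P\,|\,S)=\frac{1}{2}\log\frac{\sigma_P^2}{\sigma_{P|S}^2}
+    =\frac{1}{2}\log\Big(1+\frac{k\,\sigma_P^2}{\sigma_N^2}\Big).
+  \end{displaymath}
+  It depends on $S$ only through $k$, with decreasing increments in $k$. The
+  statistical value behaves similarly, since for a point $x\notin S$ one has
+  $\sigma_{x|S}^2=\sigma_{P|S}^2+\sigma_N^2$.
+\end{example}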
+
+Once you have defined a notion of value, there are several problems you can
+address:
+\begin{enumerate}
+    \item \emph{budget auctions:} the points in the database are associated
+        with users, and each user $i$ has a cost $c_i$ for revealing their
+        data. Given a budget $B$, the goal is to select a subset $S$ of users
+        together with a truthful payment scheme, so that the value of $S$ is
+        as large as possible while the total payments stay within the budget.
+    \item \emph{value sharing:} the goal is to find a method for splitting
+        the total value of a subset $S$ among its participants.
+    \item \emph{k-best auctions:} this is very similar to \emph{budget
+        auctions}, except that the budget constraint is replaced by a
+        constraint on the number of people in the subset $S$.
+\end{enumerate}
+
+These problems have already been studied, and optimal or near-optimal
+solutions are known when the value function is submodular (a classical
+illustration is sketched below). Fortunately, it can be shown that the
+information theoretic value defined above is submodular.
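+
+As an illustration of why submodularity helps (a classical result recalled
+here, not derived in these notes), consider the \emph{k-best} problem for an
+increasing submodular value function $V$ with $V(\emptyset)=0$ (both notions
+are defined in the next section). The greedy rule
+\begin{displaymath}
+  S_0=\emptyset,\qquad S_{t+1}=S_t\cup\{x_t\}
+  \quad\text{where $x_t$ maximises } V(S_t\cup\{x\})-V(S_t),
+\end{displaymath}
+stopped after $k$ steps, is known to satisfy
+$V(S_k)\geq\big(1-1/e\big)\max_{|S|\leq k}V(S)$.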
+
+\section{Submodularity}
+
+In this section, we assume that we are given a \emph{universe} set $U$.
+A set function $f$ is a function defined on the power set of $U$, $\mathfrak{P}(U)$.
+
+A set function $f$ defined on $\mathfrak{P}(U)$ is said to be \emph{increasing} if
+it is increasing with respect to inclusion, that is:
-A function $f$ defined on $\mathfrak{P}(U)$ will be said
-\emph{increasing} if it is increasing with regards to inclusion, that
-is:
\begin{displaymath}
\forall\,S\subseteq T,\quad f(S)\leq f(T)
\end{displaymath}
+
A \emph{decreasing} function on $\mathfrak{P}(U)$ is defined similarly.
+A set function $f$ defined on $\mathfrak{P}(U)$ is said to be
+\emph{submodular} if it satisfies the diminishing returns property, that is,
+the marginal increment obtained when adding a point to a set is a decreasing
+set function. More formally, for any point $x$ in $U$, the marginal increment
+of $f$ with respect to $x$ is the set function defined as:
+\begin{displaymath}
+  \Delta_x f(S) = f(S\cup\{x\}) - f(S)
+\end{displaymath}
+Then, $f$ is \emph{submodular} if and only if, for all $x$ in $U$, $\Delta_x f$
+is a decreasing function on the sets not containing $x$.
+
+Similarly, a \emph{supermodular} function is one whose marginal increments are
+increasing set functions.
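+
+A quick illustration, added here and using only the definitions above,
+concerns functions that depend on a set only through its cardinality.
+\begin{example}
+  Let $g:\mathbf{N}\rightarrow\mathbf{R}$ and $f(S)=g(|S|)$. For $x\notin S$,
+  $\Delta_x f(S)=g(|S|+1)-g(|S|)$, so $f$ is submodular as soon as the
+  increments of $g$ are decreasing, as for $g=\log(1+\cdot)$, which is the
+  form taken by the Gaussian value of the example above; it is supermodular
+  when the increments of $g$ are increasing, as for $g(k)=k^2$.
+\end{example}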
\begin{prop}
- Let $R:\mathbf{R}\rightarrow \mathbf{R}$ be a decreasing concave
- function and $f:\mathfrak{P}(U)\rightarrow\mathbf{R}$ be a
- decreasing submodular function, then the composed function $R\circ
- f$ is increasing and supermodular.
+  Let $R:\mathbf{R}\rightarrow \mathbf{R}$ be a decreasing concave function and
+  $f:\mathfrak{P}(U)\rightarrow\mathbf{R}$ a decreasing submodular function.
+  Then the composed function $R\circ f$ is increasing and supermodular.
\end{prop}
\begin{proof}
- The increasingness of $R\circ f$ follows immediately from the
- decreasingness of $R$ and $f$.
+ The increasingness of $R\circ f$ follows immediately from the decreasingness
+ of $R$ and $f$.
- For the supermodularity, let $S$ and $T$ be two sets such that
- $S\subseteq T$. By decreasingness of $f$, we have:
+ For the supermodularity, let $S$ and $T$ be two sets such that $S\subseteq
+ T$. By decreasingness of $f$, we have:
\begin{displaymath}
\forall\,V,\quad f(T)\leq f(S)\quad\mathrm{and}\quad f(T\cup V)\leq f(S\cup V)
\end{displaymath}
@@ -90,288 +159,4 @@ A \emph{decreasing} function on $\mathfrak{P}(U)$ is defined similarly.
which is exactly the supermodularity of $R\circ f$.
\end{proof}
-
-
-\section{Understanding the recommender system}
-
-\subsection{General problem}
-
-We already have a database $D_n$ of $n$ users. For each user $i$ we
-have a set of $k$ explanatory variables (features), this is a vector
-$x_i$.
-
-The problem is the following: we are about to start an experiment where for
-some of the users in the database, a variable of interest $y_i$ will
-be revealed (it could be for example a medical survey, a website which
-buys some data from another website). From the pairs $(x_i, y_i)$ that
-we will acquire through the experiment, we are going to compute a
-regression function $f_n$ which will allow us to predict for a new $x$
-its associated $y$. The accuracy of this prediction will be measured
-by the Mean Squared Error :
-\begin{displaymath}
- \mse(f_n) = \expt{\big(f_n(x)-y\big)^2}
-\end{displaymath}
-
-We would like to understand the impact of the number of users who take
-part in the experiment. Especially, how much does adding a user to the
-experiment impacts the MSE.
-
-\subsection{From the bivariate normal case to linear regression}
-If $(X,Y)$ is drawn from a bivariate normal distribution with mean
-vector $\mu$ and covariance matrix $\Sigma$. Then, one can
-write:
-\begin{displaymath}
- Y = \condexp{Y}{X} + \big(Y-\condexp{Y}{X}\big)
-\end{displaymath}
-In this particular case, $\condexp{Y}{X}$ is a linear function of $X$:
-\begin{displaymath}
-\condexp{Y}{X} = \alpha X + \beta
-\end{displaymath}
-where $\alpha$ and $\beta$ can be expressed as a function of $\mu$ and
-$\Sigma$. Writing $\varepsilon = Y-\condexp{Y}{X}$, it is easy to see
-that $\expt{X\varepsilon}=0$. Furthermore $\varepsilon$ is also normally
-distributed. Under these assumptions, it can be proven that the least
-square estimator for $(\alpha,\beta)$ is optimal (it reaches the
-Cramér-Rao bound).
-
-\subsection{Linear regression}
-
-We assume a linear model:
-\begin{displaymath}
- y_i = \beta\cdot x_i + \varepsilon_i
-\end{displaymath}
-
-Where $\varepsilon_i$ is a normal random variable uncorrelated to
-$x_i$. We also assume that $\var(\varepsilon_i)$ is constant
-(homoscedasticity), $\sigma^2$ will denote the common value.
-
-From the database we compute the least-squared estimator of $\beta$:
-\begin{displaymath}
- \hat{\beta}_n = (\tr XX)^{-1}\tr XY
-\end{displaymath}
-where $X$ is a $n\times k$ matrix ($k$ is the number of explanatory
-variables) whose $i$-th row $\tr x_i$ and $Y$ is $(y_1,\ldots,y_n)$.
-
-The regression function is simply the inner product of $\hat{\beta}_n$
-and the new vector of explanatory variables $x$.
-
-From there we can compute the MSE:
-\begin{displaymath}
- \mse(D_n)
- =\expt{\left(\beta\cdot x + \varepsilon - \hat\beta_n\cdot x\right)^2}
- = \tr x\expt{(\hat\beta_n-\beta)\cdot (\hat\beta_n-\beta)} x + \expt{\varepsilon^2}
-\end{displaymath}
-where we used that
-$\expt{x\varepsilon}=0$. The variance-covariance matrix of
-$\hat\beta_n$ is equal to $\sigma^2(\tr XX)^{-1}$. Finally we get:
-\begin{displaymath}
- \mse(D_n)
- = \sigma^2\tr x(\tr XX)^{-1}x + \sigma^2
-\end{displaymath}
-
-\subsubsection*{Monotonicity}
-
-We first want to study the impact of adding one observation to the
-database. First, notice that:
-\begin{displaymath}
- \tr XX = \sum_{i=1}^n x_i \tr x_i
-\end{displaymath}
-Let write $A= \tr X X$. Then, adding $x_0$ to the database will change
-$A$ to $A+x_0\tr x_0$.
-
-The following derivation will make an extensive use of the
-Sherman-Morrisson formula \cite{sm} (which can be proven by direct
-verification). For any invertible matrix $A$:
-\begin{equation}\label{eq:inverse}
-(A+x\tr y)^{-1} = A^{-1} - \frac{A^{-1}x\tr yA^{-1}}{1+\tr x A^{-1}y}
-\end{equation}
-
-$A^{-1}$ is the inverse of a positive semidefinite matrix. Hence it is
-also positive semidefinite. We will denote by $\ip{x}{y}$ the scalar product defined by $A^{-1}$,
-that is:
-\begin{displaymath}
- \ip{x}{y} = \tr x A^{-1} y = \tr y A^{-1} x
-\end{displaymath}
-
-Using \eqref{eq:inverse} we get:
-\begin{displaymath}
-\begin{split}
- \tr x (A + x_0\tr x_0)^{-1} x & = \tr x A^{-1} x - \frac{\tr x
- A^{-1}x_0\tr x_0A^{-1} x}{1+\tr x_0 A^{-1}x_0 }\\
-& = \tr x A^{-1} x - \frac{\ip{x}{x_0}^2}{1+\norm{x_0}^2}
-\end{split}
-\end{displaymath}
-
-Thus:
-\begin{equation}\label{eq:decrease}
-\mse(D_n\cup\{x_0\}) = \mse(D_n) - \frac{\sigma^2\ip{x}{x_0}^2}{1+\norm{x_0}^2}
-\end{equation}
-
-\emph{Adding one observation to the database decreases the MSE.}
-
-\subsubsection*{Submodularity}
-
-Let $D_m$ a database of size $m$ containing $D_n$: $D_m$ is obtained
-from $D_n$ by adding some observations. We would like to show that
-adding one observation to $D_m$ yields a smaller decrease in MSE than
-adding the same observation to $D_n$:
-\begin{displaymath}
- \mse(D_m)-\mse(D_m\cup\{x_0\})\leq \mse(D_n)-\mse(D_n\cup\{x_0\})
-\end{displaymath}
-
-First, remark that it is necessary and sufficient to prove this property when $D_n$ and
-$D_m$ differ by only one observation. Indeed, if the property is true
-in general, then it will also be true when the two databases differ by
-only one observation. Conversely, if the property is true when the two
-databases differ by only one observation, then applying the property
-repeatedly yields the general property.
-
-Using \eqref{eq:decrease}, the decrease of MSE when adding $x_0$ to
-the database is:
-\begin{displaymath}
- \frac{\sigma^2(\tr x A^{-1} x_0)^2}{1+\tr x_0 A^{-1} x_0}
-\end{displaymath}
-
-If we denote by $z$ the additional observation present in $D_m$ and
-not in $D_n$, then we would like to prove that:
-\begin{displaymath}
- \frac{\sigma^2(\tr x A^{-1} x_0)^2}{1+\tr x_0 A^{-1} x_0}
-\geq \frac{\sigma^2\left(\tr x (A+z\tr z)^{-1} x_0\right)^2}{1+\tr x_0 (A+z\tr z)^{-1} x_0}
-\end{displaymath}
-
-Using the same notations as before, this is equivalent
-to:
-\begin{displaymath}
- \frac{\ip{x}{x_0}^2}{1+\norm{x_0}^2}
-\geq \frac{\left(\left(1+\norm{z}^2\right)\ip{x}{x_0}-\ip{x}{z}\ip{z}{x_0}\right)^2}
-{\left(1+\norm{z}^2\right)\big((1+\norm{z}^2)(1+\norm{x_0}^2)-\ip{x_0}{z}^2\big)}
-\end{displaymath}
-
-By the Cauchy-Schwarz inequality:
-\begin{displaymath}
- (1+\norm{z}^2)(1+\norm{x_0}^2)-\ip{x_0}{z}^2 > 0
-\end{displaymath}
-
-Thus the previous inequality is consequently equivalent to:
-\begin{multline*}
- \left(1+\norm{z}^2\right)^2\left(1+\norm{x_0}^2\right)\ip{x}{x_0}^2
--\left(1+\norm{z}^2\right)\ip{x_0}{z}^2\ip{x}{x_0}^2\\
-\geq \left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)^2\ip{x}{x_0}^2
-+ \left(1+\norm{x_0}^2\right)\ip{x}{z}^2\ip{z}{x_0}^2\\
--2\left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)\ip{x}{x_0}\ip{x}{z}\ip{z}{x_0}
-\end{multline*}
-
-\begin{multline*}
-2\left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)\ip{x}{x_0}\ip{x}{z}\ip{z}{x_0}\\
-\geq \left(1+\norm{x_0}^2\right)\ip{x}{z}^2\ip{z}{x_0}^2
-+ \left(1+\norm{z}^2\right)\ip{x_0}{z}^2\ip{x}{x_0}^2
-\end{multline*}
-
-This last inequality is not true in general. As soon as $x$, $x_0$ and
-$z$ span a 2-dimensional space, it is possible that for example
-$\ip{x}{x_0}$ and $\ip{x}{z}$ are positive and $\ip{z}{x_0}$
-negative. Then the left term of the inequality will be negative and
-cannot be greater than the right term which is always positive.
-
-In the one-dimensional case, the inner product $\ip{x}{z}$ can be
-written as $\lambda xz$ for some positive $\lambda$. Then the last
-inequality becomes:
-\begin{displaymath}
- 2\geq \frac{\lambda z^2}{1+\lambda z^2}
-+ \frac{\lambda x_0^2}{1+\lambda x_0^2}
-\end{displaymath}
-which is trivially true (a more direct proof for the one-dimensional
-case is of course possible).
-
-In order to understand more precisely under which assumptions the
-above inequality could become true, it is convenient to look at it
-from the quadratic form perspective. Indeed this inequality can be
-rewritten as:
-
-\begin{equation}\label{eq-inequality}
-\tr x B x \geq 0
-\end{equation}
-
-with:
-\begin{align*}
- B = &\, \left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)\ip{x_0}{z}
- (x_0\tr z+z\tr x_0)\\
-& -\ip{x_0}{z}^2\Big( \left(1+\norm{x_0}^2\right)z\tr z + \left(1+\norm{z}^2\right)x_0\tr z\big)
-\end{align*}
-
-This quadratic form is degenerate, its kernel is $x_0^{\bot}\cap
-z^\bot$ which is of dimension $k-2$.
-
-\paragraph{Case when $\norm{x_0}=\norm{z}=1$} In this case, it suffices to study the quadratic form given by matrix
-$B'$ with:
-\begin{displaymath}
- B' = 2\ip{x_0}{z}(x_0\tr z+z\tr x_0) -\ip{x_0}{z}^2(z\tr z + x_0\tr x_0)
-\end{displaymath}
-
-Writing $a = \ip{x_0}{z}$, the two non-zero eigenvalues are:
-\begin{align*}
- \lambda_1 & = -2a^3 + 2a^2 + 4a = -2a(a+1)(a-2)\\
- \lambda_2 & = 2a^3 + 2a^2 - 4a = 2a(a-1)(a+2)
-\end{align*}
-
-which are respectively associated with the eigenvectors:
-\begin{align*}
- x_1 & = x_0+z\\
- x_2 & = x_0 - z
-\end{align*}
-
-By the Cauchy-Schwarz inequality, $a\in]-1,1[$, and the two
-eigenvalues are of opposite sign on this interval. Thus inequality
-\eqref{eq-inequality} does not hold for all $x$.
-
-\paragraph{In expectation?} If we assume a prior knowledge on the
-distribution of $x$, writing $\Sigma$ the variance-covariance matrix
-of $x$ and $\mu$ its mean vector, then taking the expectation of
-\eqref{eq-inequality} we get:
-\begin{displaymath}
- \expt{\tr x B' x} = \trace(B'\Sigma) + \tr\mu B'\mu
-\end{displaymath}
-
-\nocite{shapley,inverse,recommendation,cook,shapleyor,subsetselection11,lse}
-\bibliographystyle{plain}
-\bibliography{notes.bib}
-
-\section*{Appendix}
-
-\paragraph{Previous attempt at taming the submodularity}
-
-The inequality only depends on the projection of $x$ on the plane
-spanned by $x_0$ and $z$. Writing
-\begin{displaymath}
- x = \lambda x_0 + \mu z + v,\quad\mathrm{with}\quad v \,\bot\,
- \mathrm{span}\{x_0, v\}
-\end{displaymath}
-the previous inequality becomes:
-\begin{multline*}
-2\left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)\ip{x_0}{z}
-\left(\lambda\norm{x_0}^2+\mu\ip{x_0}{z}\right)
-\left(\lambda\ip{x_0}{z}+\mu\norm{z}^2\right)\\
-- \left(1+\norm{x_0}^2\right)\ip{x_0}{z}^2\left(\lambda\ip{x_0}{z}+\mu\norm{z}^2\right)^2\\
--
-\left(1+\norm{z}^2\right)\ip{x_0}{z}^2\left(\lambda\norm{x_0}^2+\mu\ip{x_0}{z}\right)^2\geq 0
-\end{multline*}
-
-By expanding and reordering the terms, we obtain a quadratic function
-of $\lambda$ and $\mu$:
-\begin{multline*}
- \lambda^2\ip{x_0}{z}^2\left[2\norm{x_0}^2+\norm{x_0}^4+\norm{x_0}^2\norm{z}^2
- +(1+\norm{x_0})^2\big(\norm{x_0}^2\norm{z}^2-\ip{x_0}{z}^2\big)\right]\\
-+\mu^2\ip{x_0}{z}^2\left[2\norm{z}^2+\norm{z}^4+\norm{x_0}^2\norm{z}^2
- +(1+\norm{z})^2\big(\norm{x_0}^2\norm{z}^2-\ip{x_0}{z}^2\big)\right]\\
-+2\lambda\mu\ip{x_0}{z}\Big[\norm{x_0}^2\norm{z}^2
-\big(\norm{x_0}^2\norm{z}^2-\ip{x_0}{z}^2\big)\\
-+\ip{x_0}{z}^2+\norm{x_0}^2\norm{z}^2
-\big(1+\norm{x_0}^2+\norm{z}^2\big)\Big]\geq 0
-\end{multline*}
-
-This inequality will be true for all $\lambda$ and $\mu$ if and only
-if the quadratic form is positive semidefinite. As its trace is
-positive, this is equivalent to the positiveness of its determinant.
-
-
-\end{document}
\ No newline at end of file
+\end{document}
diff --git a/notes2.tex b/notes2.tex
deleted file mode 100644
index 92eabb5..0000000
--- a/notes2.tex
+++ /dev/null
@@ -1,162 +0,0 @@
-\documentclass{article}
-\usepackage[utf8]{inputenc}
-\usepackage{amsmath,amsthm,amsfonts}
-\usepackage{comment}
-\newtheorem{lemma}{Lemma}
-\newtheorem{fact}{Fact}
-\newtheorem{example}{Example}
-\newtheorem{prop}{Proposition}
-\newcommand{\var}{\mathop{\mathrm{Var}}}
-\newcommand{\condexp}[2]{\mathop{\mathbb{E}}\left[#1|#2\right]}
-\newcommand{\expt}[1]{\mathop{\mathbb{E}}\left[#1\right]}
-\newcommand{\norm}[1]{\lVert#1\rVert}
-\newcommand{\tr}[1]{#1^*}
-\newcommand{\ip}[2]{\langle #1, #2 \rangle}
-\newcommand{\mse}{\mathop{\mathrm{MSE}}}
-\newcommand{\trace}{\mathop{\mathrm{tr}}}
-\begin{document}
-
-\section{Problem}
-
-\begin{itemize}
-\item database $D = (X_i)_{1\leq i\leq n}$
-\item $(X_i)_{1\leq i\leq n}\in\big(\mathbf{R^d}\big)^n$ sampled in an i.i.d
- fashion from the probability distribution $\mu$
-\end{itemize}
-
-We assume that there is a common variable $P$ which is the underlying
-phenomenon that we are interesting in, and all the variables in the database,
-are noisy realisations of this phenomenon:
-\begin{displaymath}
- X_i = P + N_i
-\end{displaymath}
-where the $N_i$ are pairwise independent and also independent from $H$.
-
-
-We are interested in defining the $\emph{value}$ of a subset $S$ of the
-database $D$. The approaches to such a definition can be classified in two main
-families:
-\begin{itemize}
- \item \emph{Information theoretic:} We use the entropy of $P$ as a measure
- of the uncertainty. Then the value of a subset $S$ is the decrease of
- entropy induced by revealing it:
- \begin{displaymath}
- V(S) = H(P) - H(P\,|\,S)
- \end{displaymath}
- \item \emph{Statistical:} in this case you are trying to estimate the
- phenomenon. The variance of the estimation is used as a measure of
- the uncertainty. The value of a set $S$ can be defined as the average
- reduction of the variance over all the points in the database once you
- observe $S$:
- \begin{displaymath}
- V(S) = \frac{1}{|D|}\sum_{x\in D} \sigma_x^2 - \sigma_{x|A}^2
- \end{displaymath}
-\end{itemize}
-
-Once you have defined a notion of value, there are several problems you can
-address:
-\begin{enumerate}
- \item \emph{budget auctions:} the points in the database are associated
- with users. Each user $i$ has a cost $c_i$ for revealing his data. You
- are given a budget $B$ and you want to select a subset $S$ of users and
- a payment scheme which is truthful. The value of the subset
- $S$ should be as big as possible with the constraint that the payments
- should stay within budget.
- \item \emph{value sharing:} the goal is to find a value sharing method to
- split the whole value of a subset $S$ among its participants.
- \item \emph{k-best auctions:} this one is very similar to \emph{budget
- auctions}, but instead of having a budget constraint, the constraint is
- on the number of people in the subset $S$.
-\end{enumerate}
-
-These problems have already been studied, and optimal or near-optimal solution
-are already known when the value function is submodular. Fortunately, it can be
-shown that the information theoretic value defined above is submodular.
-
-\section{Submodularity}
-
-In this section, we will consider that we are given a \emph{universe} set $U$.
-A set function $f$ is a function defined on the power set of $U$, $\mathfrak{P}(U)$.
-
-A set function $f$ defined on $\mathfrak{P}(U)$ will be said \emph{increasing} if
-it is increasing with regards to inclusion, that is:
-
-\begin{displaymath}
- \forall\,S\subseteq T,\quad f(S)\leq f(T)
-\end{displaymath}
-
-A \emph{decreasing} function on $\mathfrak{P}(U)$ is defined similarly.
-
-A set function $f$ defined on $\mathfrak{P}(U)$ is said to be
-\emph{submodular} if it verifies the diminishing returns property, that is,
-the marginal increments when adding a point to a set, is a set decreasing
-function. More formally, for any point $x$ in $U$, we can define the marginal
-increment of $f$ regarding $x$, it is the set function defined as:
-\begin{displaymath}
- \Delta_x f(S) = f(S\cup\{x\}) - f(S)
-\end{displaymath}
-Then, $f$ is \emph{submodular} iff. for all $x$ in $U$, $\Delta_x f$ is a set
-decreasing function.
-
-Similarly, a \emph{supermodular} is a function whose marginal increments are set
-increasing functions.
-\begin{prop}
- Let $R:\mathbf{R}\rightarrow \mathbf{R}$ be a decreasing concave function and
- $f:\mathfrak{P}(U)\rightarrow\mathbf{R}$ be a decreasing submodular function,
- then the composed function $R\circ f$ is increasing and supermodular.
-\end{prop}
-
-\begin{proof}
- The increasingness of $R\circ f$ follows immediately from the decreasingness
- of $R$ and $f$.
-
- For the supermodularity, let $S$ and $T$ be two sets such that $S\subseteq
- T$. By decreasingness of $f$, we have:
- \begin{displaymath}
- \forall\,V,\quad f(T)\leq f(S)\quad\mathrm{and}\quad f(T\cup V)\leq f(S\cup V)
- \end{displaymath}
-
- Thus, by concavity of $R$:
- \begin{displaymath}\label{eq:base}
- \forall\,V,\quad\frac{R\big(f(S)\big)-R\big(f(S\cup V)\big)}{f(S)-f(S\cup V)}
- \leq\frac{R\big(f(T)\big)-R\big(f(T\cup V)\big)}{f(T)-f(T\cup V)}
- \end{displaymath}
-
- $f$ is decreasing, so multiplying this last inequality by
- $f(S)-f(S\cup V)$ and $f(T)-f(T\cup V)$ yields:
- \begin{multline}
- \forall V,\quad\Big(R\big(f(S)\big)-R\big(f(S\cup V)\big)\Big)\big(f(T)-f(T\cup V)\big)\\
- \leq \Big(R\big(f(T)\big)-R\big(f(T\cup V)\big)\Big)\big(f(S)-f(S\cup V)\big)
- \end{multline}
-
- $f$ is submodular, so:
- \begin{displaymath}
- f(T\cup V)-f(T)\leq f(S\cup V) - f(S)
- \end{displaymath}
-
- $R\circ f$ is increasing, so:
- \begin{displaymath}
- R\big(f(S)\big)-R\big(f(S\cup V)\big)\leq 0
- \end{displaymath}
-
- By combining the two previous inequalities, we get:
- \begin{multline*}
- \forall V,\quad\Big(R\big(f(S)\big)-R\big(f(S\cup V)\big)\Big)\big(f(S)-f(S\cup V)\big)\\
- \leq \Big(R\big(f(S)\big)-R\big(f(S\cup V)\big)\Big)\big(f(T)-f(T\cup V)\big)
- \end{multline*}
-
- Injecting this last inequality into \eqref{eq:base} gives:
- \begin{multline*}
- \forall V,\quad\Big(R\big(f(S)\big)-R\big(f(S\cup V)\big)\Big)\big(f(S)-f(S\cup V)\big)\\
- \leq \Big(R\big(f(T)\big)-R\big(f(T\cup V)\big)\Big)\big(f(S)-f(S\cup V)\big)
- \end{multline*}
-
- Dividing left and right by $f(S)-f(S\cup V)$ yields:
- \begin{displaymath}
- \forall V,\quad R\big(f(S)\big)-R\big(f(S\cup V)\big)
- \leq R\big(f(T)\big)-R\big(f(T\cup V)\big)
- \end{displaymath}
- which is exactly the supermodularity of $R\circ f$.
-\end{proof}
-
-\end{document}
diff --git a/old_notes.tex b/old_notes.tex
new file mode 100644
index 0000000..8dc2c7b
--- /dev/null
+++ b/old_notes.tex
@@ -0,0 +1,377 @@
+\documentclass{article}
+\usepackage[utf8]{inputenc}
+\usepackage{amsmath,amsthm,amsfonts}
+\usepackage{comment}
+\newtheorem{lemma}{Lemma}
+\newtheorem{prop}{Proposition}
+\newcommand{\var}{\mathop{\mathrm{Var}}}
+\newcommand{\condexp}[2]{\mathop{\mathbb{E}}\left[#1|#2\right]}
+\newcommand{\expt}[1]{\mathop{\mathbb{E}}\left[#1\right]}
+\newcommand{\norm}[1]{\lVert#1\rVert}
+\newcommand{\tr}[1]{#1^*}
+\newcommand{\ip}[2]{\langle #1, #2 \rangle}
+\newcommand{\mse}{\mathop{\mathrm{MSE}}}
+\newcommand{\trace}{\mathop{\mathrm{tr}}}
+\begin{document}
+
+
+\section{pomme}
+
+In this section, we will consider that we are given a \emph{universe}
+set $U$ such that all the sets we consider are subsets or $U$ and all
+functions defined on sets are defined on the power set of $U$,
+$\mathfrak{P}(U)$.
+
+A function $f$ defined on $\mathfrak{P}(U)$ will be said
+\emph{increasing} if it is increasing with regards to inclusion, that
+is:
+\begin{displaymath}
+ \forall\,S\subseteq T,\quad f(S)\leq f(T)
+\end{displaymath}
+A \emph{decreasing} function on $\mathfrak{P}(U)$ is defined similarly.
+
+\begin{prop}
+ Let $R:\mathbf{R}\rightarrow \mathbf{R}$ be a decreasing concave
+ function and $f:\mathfrak{P}(U)\rightarrow\mathbf{R}$ be a
+ decreasing submodular function, then the composed function $R\circ
+ f$ is increasing and supermodular.
+\end{prop}
+
+\begin{proof}
+ The increasingness of $R\circ f$ follows immediately from the
+ decreasingness of $R$ and $f$.
+
+ For the supermodularity, let $S$ and $T$ be two sets such that
+ $S\subseteq T$. By decreasingness of $f$, we have:
+ \begin{displaymath}
+ \forall\,V,\quad f(T)\leq f(S)\quad\mathrm{and}\quad f(T\cup V)\leq f(S\cup V)
+ \end{displaymath}
+
+ Thus, by concavity of $R$:
+ \begin{displaymath}\label{eq:base}
+ \forall\,V,\quad\frac{R\big(f(S)\big)-R\big(f(S\cup V)\big)}{f(S)-f(S\cup V)}
+ \leq\frac{R\big(f(T)\big)-R\big(f(T\cup V)\big)}{f(T)-f(T\cup V)}
+ \end{displaymath}
+
+ $f$ is decreasing, so multiplying this last inequality by
+ $f(S)-f(S\cup V)$ and $f(T)-f(T\cup V)$ yields:
+ \begin{multline}
+ \forall V,\quad\Big(R\big(f(S)\big)-R\big(f(S\cup V)\big)\Big)\big(f(T)-f(T\cup V)\big)\\
+ \leq \Big(R\big(f(T)\big)-R\big(f(T\cup V)\big)\Big)\big(f(S)-f(S\cup V)\big)
+ \end{multline}
+
+ $f$ is submodular, so:
+ \begin{displaymath}
+ f(T\cup V)-f(T)\leq f(S\cup V) - f(S)
+ \end{displaymath}
+
+ $R\circ f$ is increasing, so:
+ \begin{displaymath}
+ R\big(f(S)\big)-R\big(f(S\cup V)\big)\leq 0
+ \end{displaymath}
+
+ By combining the two previous inequalities, we get:
+ \begin{multline*}
+ \forall V,\quad\Big(R\big(f(S)\big)-R\big(f(S\cup V)\big)\Big)\big(f(S)-f(S\cup V)\big)\\
+ \leq \Big(R\big(f(S)\big)-R\big(f(S\cup V)\big)\Big)\big(f(T)-f(T\cup V)\big)
+ \end{multline*}
+
+ Injecting this last inequality into \eqref{eq:base} gives:
+ \begin{multline*}
+ \forall V,\quad\Big(R\big(f(S)\big)-R\big(f(S\cup V)\big)\Big)\big(f(S)-f(S\cup V)\big)\\
+ \leq \Big(R\big(f(T)\big)-R\big(f(T\cup V)\big)\Big)\big(f(S)-f(S\cup V)\big)
+ \end{multline*}
+
+ Dividing left and right by $f(S)-f(S\cup V)$ yields:
+ \begin{displaymath}
+ \forall V,\quad R\big(f(S)\big)-R\big(f(S\cup V)\big)
+ \leq R\big(f(T)\big)-R\big(f(T\cup V)\big)
+ \end{displaymath}
+ which is exactly the supermodularity of $R\circ f$.
+\end{proof}
+
+
+
+\section{Understanding the recommender system}
+
+\subsection{General problem}
+
+We already have a database $D_n$ of $n$ users. For each user $i$ we
+have a set of $k$ explanatory variables (features), collected in a vector
+$x_i$.
+
+The problem is the following: we are about to start an experiment in which,
+for some of the users in the database, a variable of interest $y_i$ will be
+revealed (for example through a medical survey, or because a website buys
+some data from another website). From the pairs $(x_i, y_i)$ acquired
+through the experiment, we will compute a regression function $f_n$ which
+allows us to predict, for a new $x$, its associated $y$. The accuracy of
+this prediction is measured by the Mean Squared Error:
+\begin{displaymath}
+ \mse(f_n) = \expt{\big(f_n(x)-y\big)^2}
+\end{displaymath}
+
+We would like to understand the impact of the number of users who take
+part in the experiment. In particular, how much does adding a user to the
+experiment impact the MSE?
+
+\subsection{From the bivariate normal case to linear regression}
+Suppose $(X,Y)$ is drawn from a bivariate normal distribution with mean
+vector $\mu$ and covariance matrix $\Sigma$. Then one can
+write:
+\begin{displaymath}
+ Y = \condexp{Y}{X} + \big(Y-\condexp{Y}{X}\big)
+\end{displaymath}
+In this particular case, $\condexp{Y}{X}$ is a linear function of $X$:
+\begin{displaymath}
+\condexp{Y}{X} = \alpha X + \beta
+\end{displaymath}
+where $\alpha$ and $\beta$ can be expressed as a function of $\mu$ and
+$\Sigma$. Writing $\varepsilon = Y-\condexp{Y}{X}$, it is easy to see
+that $\expt{X\varepsilon}=0$. Furthermore $\varepsilon$ is also normally
+distributed. Under these assumptions, it can be proven that the least
+square estimator for $(\alpha,\beta)$ is optimal (it reaches the
+Cramér-Rao bound).
+
+\subsection{Linear regression}
+
+We assume a linear model:
+\begin{displaymath}
+ y_i = \beta\cdot x_i + \varepsilon_i
+\end{displaymath}
+
+where $\varepsilon_i$ is a normal random variable uncorrelated with
+$x_i$. We also assume that $\var(\varepsilon_i)$ is constant
+(homoscedasticity); $\sigma^2$ will denote the common value.
+
+From the database we compute the least-squares estimator of $\beta$:
+\begin{displaymath}
+ \hat{\beta}_n = (\tr XX)^{-1}\tr XY
+\end{displaymath}
+where $X$ is an $n\times k$ matrix ($k$ is the number of explanatory
+variables) whose $i$-th row is $\tr x_i$, and $Y$ is $(y_1,\ldots,y_n)$.
+
+The regression function is simply the inner product of $\hat{\beta}_n$
+and the new vector of explanatory variables $x$.
+
+From there we can compute the MSE:
+\begin{displaymath}
+ \mse(D_n)
+ =\expt{\left(\beta\cdot x + \varepsilon - \hat\beta_n\cdot x\right)^2}
+ = \tr x\expt{(\hat\beta_n-\beta)\cdot (\hat\beta_n-\beta)} x + \expt{\varepsilon^2}
+\end{displaymath}
+where we used that
+$\expt{x\varepsilon}=0$. The variance-covariance matrix of
+$\hat\beta_n$ is equal to $\sigma^2(\tr XX)^{-1}$. Finally we get:
+\begin{displaymath}
+ \mse(D_n)
+ = \sigma^2\tr x(\tr XX)^{-1}x + \sigma^2
+\end{displaymath}
+
+\subsubsection*{Monotonicity}
+
+We first want to study the impact of adding one observation to the
+database. First, notice that:
+\begin{displaymath}
+ \tr XX = \sum_{i=1}^n x_i \tr x_i
+\end{displaymath}
+Let us write $A= \tr X X$. Then, adding $x_0$ to the database changes
+$A$ to $A+x_0\tr x_0$.
+
+The following derivation makes extensive use of the
+Sherman-Morrison formula \cite{sm} (which can be proven by direct
+verification). For any invertible matrix $A$:
+\begin{equation}\label{eq:inverse}
+(A+x\tr y)^{-1} = A^{-1} - \frac{A^{-1}x\tr yA^{-1}}{1+\tr x A^{-1}y}
+\end{equation}
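+
+As a side note added here, the ``direct verification'' amounts to multiplying
+the right-hand side by $A+x\tr y$; we sketch it for the symmetric case used
+below, where $\tr x A^{-1}y=\tr y A^{-1}x$:
+\begin{displaymath}
+  (A+x\tr y)\Big(A^{-1}-\frac{A^{-1}x\tr y A^{-1}}{1+\tr y A^{-1}x}\Big)
+  = I + x\tr y A^{-1}
+  - \frac{x\tr y A^{-1}\big(1+\tr y A^{-1}x\big)}{1+\tr y A^{-1}x} = I
+\end{displaymath}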
+
+$A^{-1}$ is the inverse of a positive semidefinite matrix. Hence it is
+also positive semidefinite. We will denote by $\ip{x}{y}$ the scalar product defined by $A^{-1}$,
+that is:
+\begin{displaymath}
+ \ip{x}{y} = \tr x A^{-1} y = \tr y A^{-1} x
+\end{displaymath}
+
+Using \eqref{eq:inverse} we get:
+\begin{displaymath}
+\begin{split}
+ \tr x (A + x_0\tr x_0)^{-1} x & = \tr x A^{-1} x - \frac{\tr x
+ A^{-1}x_0\tr x_0A^{-1} x}{1+\tr x_0 A^{-1}x_0 }\\
+& = \tr x A^{-1} x - \frac{\ip{x}{x_0}^2}{1+\norm{x_0}^2}
+\end{split}
+\end{displaymath}
+
+Thus:
+\begin{equation}\label{eq:decrease}
+\mse(D_n\cup\{x_0\}) = \mse(D_n) - \frac{\sigma^2\ip{x}{x_0}^2}{1+\norm{x_0}^2}
+\end{equation}
+
+\emph{Adding one observation to the database decreases the MSE.}
+
+\subsubsection*{Submodularity}
+
+Let $D_m$ a database of size $m$ containing $D_n$: $D_m$ is obtained
+from $D_n$ by adding some observations. We would like to show that
+adding one observation to $D_m$ yields a smaller decrease in MSE than
+adding the same observation to $D_n$:
+\begin{displaymath}
+ \mse(D_m)-\mse(D_m\cup\{x_0\})\leq \mse(D_n)-\mse(D_n\cup\{x_0\})
+\end{displaymath}
+
+First, remark that it is necessary and sufficient to prove this property when $D_n$ and
+$D_m$ differ by only one observation. Indeed, if the property is true
+in general, then it will also be true when the two databases differ by
+only one observation. Conversely, if the property is true when the two
+databases differ by only one observation, then applying the property
+repeatedly yields the general property.
+
+Using \eqref{eq:decrease}, the decrease of MSE when adding $x_0$ to
+the database is:
+\begin{displaymath}
+ \frac{\sigma^2(\tr x A^{-1} x_0)^2}{1+\tr x_0 A^{-1} x_0}
+\end{displaymath}
+
+If we denote by $z$ the additional observation present in $D_m$ and
+not in $D_n$, then we would like to prove that:
+\begin{displaymath}
+ \frac{\sigma^2(\tr x A^{-1} x_0)^2}{1+\tr x_0 A^{-1} x_0}
+\geq \frac{\sigma^2\left(\tr x (A+z\tr z)^{-1} x_0\right)^2}{1+\tr x_0 (A+z\tr z)^{-1} x_0}
+\end{displaymath}
+
+Using the same notations as before, this is equivalent
+to:
+\begin{displaymath}
+ \frac{\ip{x}{x_0}^2}{1+\norm{x_0}^2}
+\geq \frac{\left(\left(1+\norm{z}^2\right)\ip{x}{x_0}-\ip{x}{z}\ip{z}{x_0}\right)^2}
+{\left(1+\norm{z}^2\right)\big((1+\norm{z}^2)(1+\norm{x_0}^2)-\ip{x_0}{z}^2\big)}
+\end{displaymath}
+
+By the Cauchy-Schwarz inequality:
+\begin{displaymath}
+ (1+\norm{z}^2)(1+\norm{x_0}^2)-\ip{x_0}{z}^2 > 0
+\end{displaymath}
+
+Thus the previous inequality is consequently equivalent to:
+\begin{multline*}
+ \left(1+\norm{z}^2\right)^2\left(1+\norm{x_0}^2\right)\ip{x}{x_0}^2
+-\left(1+\norm{z}^2\right)\ip{x_0}{z}^2\ip{x}{x_0}^2\\
+\geq \left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)^2\ip{x}{x_0}^2
++ \left(1+\norm{x_0}^2\right)\ip{x}{z}^2\ip{z}{x_0}^2\\
+-2\left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)\ip{x}{x_0}\ip{x}{z}\ip{z}{x_0}
+\end{multline*}
+
+\begin{multline*}
+2\left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)\ip{x}{x_0}\ip{x}{z}\ip{z}{x_0}\\
+\geq \left(1+\norm{x_0}^2\right)\ip{x}{z}^2\ip{z}{x_0}^2
++ \left(1+\norm{z}^2\right)\ip{x_0}{z}^2\ip{x}{x_0}^2
+\end{multline*}
+
+This last inequality is not true in general. As soon as $x$, $x_0$ and
+$z$ span a 2-dimensional space, it is possible that for example
+$\ip{x}{x_0}$ and $\ip{x}{z}$ are positive and $\ip{z}{x_0}$
+negative. Then the left term of the inequality will be negative and
+cannot be greater than the right term which is always positive.
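+
+A concrete instance, added here for illustration (the vectors below are
+chosen only to exhibit the sign pattern described above): take $k=2$,
+$A=I_2$ (attainable, e.g.\ from the two observations $(1,0)$ and $(0,1)$),
+$x_0=(1,0)$, $z=(-\tfrac{1}{2},\tfrac{\sqrt{3}}{2})$ and $x=(1,2)$. Then
+$\ip{x}{x_0}=1>0$, $\ip{x}{z}=\sqrt{3}-\tfrac{1}{2}>0$ and
+$\ip{z}{x_0}=-\tfrac{1}{2}<0$, so the left-hand side is negative while the
+right-hand side is positive, and the inequality fails.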
+
+In the one-dimensional case, the inner product $\ip{x}{z}$ can be
+written as $\lambda xz$ for some positive $\lambda$. Then the last
+inequality becomes:
+\begin{displaymath}
+ 2\geq \frac{\lambda z^2}{1+\lambda z^2}
++ \frac{\lambda x_0^2}{1+\lambda x_0^2}
+\end{displaymath}
+which is trivially true (a more direct proof for the one-dimensional
+case is of course possible).
+
+In order to understand more precisely under which assumptions the
+above inequality could become true, it is convenient to look at it
+from the quadratic form perspective. Indeed this inequality can be
+rewritten as:
+
+\begin{equation}\label{eq-inequality}
+\tr x B x \geq 0
+\end{equation}
+
+with:
+\begin{align*}
+ B = &\, \left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)\ip{x_0}{z}
+ (x_0\tr z+z\tr x_0)\\
+& -\ip{x_0}{z}^2\Big( \left(1+\norm{x_0}^2\right)z\tr z + \left(1+\norm{z}^2\right)x_0\tr z\big)
+\end{align*}
+
+This quadratic form is degenerate, its kernel is $x_0^{\bot}\cap
+z^\bot$ which is of dimension $k-2$.
+
+\paragraph{Case when $\norm{x_0}=\norm{z}=1$} In this case, it suffices to study the quadratic form given by matrix
+$B'$ with:
+\begin{displaymath}
+ B' = 2\ip{x_0}{z}(x_0\tr z+z\tr x_0) -\ip{x_0}{z}^2(z\tr z + x_0\tr x_0)
+\end{displaymath}
+
+Writing $a = \ip{x_0}{z}$, the two non-zero eigenvalues are:
+\begin{align*}
+ \lambda_1 & = -2a^3 + 2a^2 + 4a = -2a(a+1)(a-2)\\
+ \lambda_2 & = 2a^3 + 2a^2 - 4a = 2a(a-1)(a+2)
+\end{align*}
+
+which are respectively associated with the eigenvectors:
+\begin{align*}
+ x_1 & = x_0+z\\
+ x_2 & = x_0 - z
+\end{align*}
+
+By the Cauchy-Schwarz inequality, $a\in]-1,1[$, and the two
+eigenvalues are of opposite sign on this interval. Thus inequality
+\eqref{eq-inequality} does not hold for all $x$.
+
+\paragraph{In expectation?} If we assume a prior knowledge on the
+distribution of $x$, writing $\Sigma$ the variance-covariance matrix
+of $x$ and $\mu$ its mean vector, then taking the expectation of
+\eqref{eq-inequality} we get:
+\begin{displaymath}
+ \expt{\tr x B' x} = \trace(B'\Sigma) + \tr\mu B'\mu
+\end{displaymath}
+
+\nocite{shapley,inverse,recommendation,cook,shapleyor,subsetselection11,lse}
+\bibliographystyle{plain}
+\bibliography{notes.bib}
+
+\section*{Appendix}
+
+\paragraph{Previous attempt at taming the submodularity}
+
+The inequality only depends on the projection of $x$ on the plane
+spanned by $x_0$ and $z$. Writing
+\begin{displaymath}
+ x = \lambda x_0 + \mu z + v,\quad\mathrm{with}\quad v \,\bot\,
+ \mathrm{span}\{x_0, v\}
+\end{displaymath}
+the previous inequality becomes:
+\begin{multline*}
+2\left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)\ip{x_0}{z}
+\left(\lambda\norm{x_0}^2+\mu\ip{x_0}{z}\right)
+\left(\lambda\ip{x_0}{z}+\mu\norm{z}^2\right)\\
+- \left(1+\norm{x_0}^2\right)\ip{x_0}{z}^2\left(\lambda\ip{x_0}{z}+\mu\norm{z}^2\right)^2\\
+-
+\left(1+\norm{z}^2\right)\ip{x_0}{z}^2\left(\lambda\norm{x_0}^2+\mu\ip{x_0}{z}\right)^2\geq 0
+\end{multline*}
+
+By expanding and reordering the terms, we obtain a quadratic function
+of $\lambda$ and $\mu$:
+\begin{multline*}
+ \lambda^2\ip{x_0}{z}^2\left[2\norm{x_0}^2+\norm{x_0}^4+\norm{x_0}^2\norm{z}^2
+ +(1+\norm{x_0})^2\big(\norm{x_0}^2\norm{z}^2-\ip{x_0}{z}^2\big)\right]\\
++\mu^2\ip{x_0}{z}^2\left[2\norm{z}^2+\norm{z}^4+\norm{x_0}^2\norm{z}^2
+ +(1+\norm{z})^2\big(\norm{x_0}^2\norm{z}^2-\ip{x_0}{z}^2\big)\right]\\
++2\lambda\mu\ip{x_0}{z}\Big[\norm{x_0}^2\norm{z}^2
+\big(\norm{x_0}^2\norm{z}^2-\ip{x_0}{z}^2\big)\\
++\ip{x_0}{z}^2+\norm{x_0}^2\norm{z}^2
+\big(1+\norm{x_0}^2+\norm{z}^2\big)\Big]\geq 0
+\end{multline*}
+
+This inequality will be true for all $\lambda$ and $\mu$ if and only
+if the quadratic form is positive semidefinite. As its trace is
+positive, this is equivalent to the positiveness of its determinant.
+
+
+\end{document}
\ No newline at end of file
diff --git a/schema.pdf b/slides/schema.pdf
index c44eb38..c44eb38 100644
--- a/schema.pdf
+++ b/slides/schema.pdf
Binary files differ
diff --git a/slides.tex b/slides/slides.tex
index ec2e5be..ec2e5be 100644
--- a/slides.tex
+++ b/slides/slides.tex