\documentclass{article}
\usepackage[utf8]{inputenc}
\usepackage{amsmath,amsthm,amsfonts}
\usepackage{comment}
\newtheorem{lemma}{Lemma}
\newcommand{\var}{\mathop{\mathrm{Var}}}
\newcommand{\condexp}[2]{\mathop{\mathbb{E}}\left[#1|#2\right]}
\newcommand{\expt}[1]{\mathop{\mathbb{E}}\left[#1\right]}
\newcommand{\norm}[1]{\lVert#1\rVert}
\newcommand{\tr}[1]{#1^*}
\newcommand{\ip}[2]{\langle #1, #2 \rangle}
\newcommand{\mse}{\mathop{\mathrm{MSE}}}
\newcommand{\trace}{\mathop{\mathrm{tr}}}
\begin{document}
\section{Understanding the recommender system}
\subsection{General problem}
We already have a database $D_n$ of $n$ users. For each user $i$ we have a vector $x_i$ of $k$ explanatory variables (features). The problem is the following: we are about to start an experiment in which, for some of the users in the database, a variable of interest $y_i$ will be revealed (for example through a medical survey, or a website buying data from another website). From the pairs $(x_i, y_i)$ acquired through the experiment, we will compute a regression function $f_n$ which allows us to predict, for a new $x$, its associated $y$. The accuracy of this prediction is measured by the Mean Squared Error:
\begin{displaymath}
\mse(f_n) = \expt{\big(f_n(x)-y\big)^2}
\end{displaymath}
We would like to understand the impact of the number of users who take part in the experiment. In particular, how much does adding a user to the experiment impact the MSE?
\subsection{From the bivariate normal case to linear regression}
Suppose $(X,Y)$ is drawn from a bivariate normal distribution with mean vector $\mu$ and covariance matrix $\Sigma$. Then one can write:
\begin{displaymath}
Y = \condexp{Y}{X} + \big(Y-\condexp{Y}{X}\big)
\end{displaymath}
In this particular case, $\condexp{Y}{X}$ is a linear function of $X$:
\begin{displaymath}
\condexp{Y}{X} = \alpha X + \beta
\end{displaymath}
where $\alpha$ and $\beta$ can be expressed as functions of $\mu$ and $\Sigma$ (in terms of their entries, $\alpha = \Sigma_{XY}/\Sigma_{XX}$ and $\beta = \mu_Y - \alpha\mu_X$). Writing $\varepsilon = Y-\condexp{Y}{X}$, it is easy to see that $\expt{X\varepsilon}=0$. Furthermore $\varepsilon$ is also normally distributed. Under these assumptions, it can be proven that the least squares estimator for $(\alpha,\beta)$ is optimal (it reaches the Cramér-Rao bound).
\subsection{Linear regression}
We assume a linear model:
\begin{displaymath}
y_i = \beta\cdot x_i + \varepsilon_i
\end{displaymath}
where $\varepsilon_i$ is a normal random variable uncorrelated with $x_i$. We also assume that $\var(\varepsilon_i)$ is constant (homoscedasticity); $\sigma^2$ will denote the common value. From the database we compute the least squares estimator of $\beta$:
\begin{displaymath}
\hat{\beta}_n = (\tr XX)^{-1}\tr XY
\end{displaymath}
where $X$ is the $n\times k$ matrix ($k$ is the number of explanatory variables) whose $i$-th row is $\tr x_i$, and $Y$ is $(y_1,\ldots,y_n)$. The regression function is simply the inner product of $\hat{\beta}_n$ and the new vector of explanatory variables $x$. From there we can compute the MSE:
\begin{displaymath}
\mse(D_n) =\expt{\left(\beta\cdot x + \varepsilon - \hat\beta_n\cdot x\right)^2}
= \tr x\expt{(\hat\beta_n-\beta)\tr{(\hat\beta_n-\beta)}} x + \expt{\varepsilon^2}
\end{displaymath}
where we used that the noise $\varepsilon$ of the new observation is uncorrelated with $x$ and with the observations in the database, so that the cross term vanishes. The variance-covariance matrix of $\hat\beta_n$ is equal to $\sigma^2(\tr XX)^{-1}$.
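For completeness, this identity can be checked directly. Treating the design matrix $X$ as fixed and writing $e = (\varepsilon_1,\ldots,\varepsilon_n)$ for the noise vector of the database (a notation used only in this remark), we have $\hat\beta_n - \beta = (\tr XX)^{-1}\tr X e$, so that
\begin{displaymath}
\expt{(\hat\beta_n-\beta)\tr{(\hat\beta_n-\beta)}}
= (\tr XX)^{-1}\tr X\,\expt{e\tr e}\,X(\tr XX)^{-1}
= \sigma^2(\tr XX)^{-1}
\end{displaymath}
where the last equality uses $\expt{e\tr e} = \sigma^2 I$, i.e.\ that the noise terms of different observations are uncorrelated and homoscedastic, as is implicit in the model.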
Finally we get:
\begin{displaymath}
\mse(D_n) = \sigma^2\tr x(\tr XX)^{-1}x + \sigma^2
\end{displaymath}
\subsubsection*{Monotonicity}
We first want to study the impact of adding one observation to the database. First, notice that:
\begin{displaymath}
\tr XX = \sum_{i=1}^n x_i \tr x_i
\end{displaymath}
Let us write $A = \tr XX$. Then, adding $x_0$ to the database will change $A$ into $A+x_0\tr x_0$. The following derivation makes extensive use of the Sherman-Morrison formula \cite{sm} (which can be proved by direct verification). For any invertible matrix $A$ and vectors $x$, $y$ such that $1+\tr y A^{-1}x \neq 0$:
\begin{equation}\label{eq:inverse}
(A+x\tr y)^{-1} = A^{-1} - \frac{A^{-1}x\tr yA^{-1}}{1+\tr y A^{-1}x}
\end{equation}
$A = \tr XX$ is positive semidefinite, and since we assume it is invertible it is in fact positive definite; hence so is $A^{-1}$. We will denote by $\ip{x}{y}$ the scalar product defined by $A^{-1}$, that is:
\begin{displaymath}
\ip{x}{y} = \tr x A^{-1} y = \tr y A^{-1} x
\end{displaymath}
Using \eqref{eq:inverse} we get:
\begin{displaymath}
\begin{split}
\tr x (A + x_0\tr x_0)^{-1} x & = \tr x A^{-1} x - \frac{\tr x A^{-1}x_0\tr x_0A^{-1} x}{1+\tr x_0 A^{-1}x_0 }\\
& = \tr x A^{-1} x - \frac{\ip{x}{x_0}^2}{1+\norm{x_0}^2}
\end{split}
\end{displaymath}
Thus:
\begin{equation}\label{eq:decrease}
\mse(D_n\cup\{x_0\}) = \mse(D_n) - \frac{\sigma^2\ip{x}{x_0}^2}{1+\norm{x_0}^2}
\end{equation}
\emph{Adding one observation to the database never increases the MSE (and strictly decreases it whenever $\ip{x}{x_0}\neq 0$).}
\subsubsection*{Submodularity}
Let $D_m$ be a database of size $m$ containing $D_n$: $D_m$ is obtained from $D_n$ by adding some observations. We would like to show that adding one observation to $D_m$ yields a smaller decrease in MSE than adding the same observation to $D_n$:
\begin{displaymath}
\mse(D_m)-\mse(D_m\cup\{x_0\})\leq \mse(D_n)-\mse(D_n\cup\{x_0\})
\end{displaymath}
First, remark that it suffices to prove this property when $D_n$ and $D_m$ differ by only one observation (the two statements are in fact equivalent, since the one-observation case is a special case of the general one): applying the one-observation property repeatedly along a chain of intermediate databases yields the general statement, as spelled out below.
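To spell out this reduction, write $D_m = D_n\cup\{z_1,\ldots,z_{m-n}\}$ and, for $0\leq j\leq m-n$, let $E_j = D_n\cup\{z_1,\ldots,z_j\}$ (the $z_j$ and $E_j$ are notation introduced only for this remark), so that $E_0 = D_n$ and $E_{m-n} = D_m$. If the property holds for databases differing by one observation, then for every $0\leq j < m-n$, since $E_{j+1} = E_j\cup\{z_{j+1}\}$,
\begin{displaymath}
\mse(E_{j+1})-\mse(E_{j+1}\cup\{x_0\}) \leq \mse(E_{j})-\mse(E_{j}\cup\{x_0\})
\end{displaymath}
and chaining these inequalities from $j = m-n-1$ down to $j = 0$ gives exactly the desired comparison between $D_m$ and $D_n$.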
Using \eqref{eq:decrease}, the decrease of MSE when adding $x_0$ to the database is:
\begin{displaymath}
\frac{\sigma^2(\tr x A^{-1} x_0)^2}{1+\tr x_0 A^{-1} x_0}
\end{displaymath}
If we denote by $z$ the additional observation present in $D_m$ and not in $D_n$, then we would like to prove that:
\begin{displaymath}
\frac{\sigma^2(\tr x A^{-1} x_0)^2}{1+\tr x_0 A^{-1} x_0}
\geq \frac{\sigma^2\left(\tr x (A+z\tr z)^{-1} x_0\right)^2}{1+\tr x_0 (A+z\tr z)^{-1} x_0}
\end{displaymath}
Using the same notations as before, this is equivalent to:
\begin{displaymath}
\frac{\ip{x}{x_0}^2}{1+\norm{x_0}^2} \geq
\frac{\left(\left(1+\norm{z}^2\right)\ip{x}{x_0}-\ip{x}{z}\ip{z}{x_0}\right)^2}
{\left(1+\norm{z}^2\right)\big((1+\norm{z}^2)(1+\norm{x_0}^2)-\ip{x_0}{z}^2\big)}
\end{displaymath}
By the Cauchy-Schwarz inequality:
\begin{displaymath}
(1+\norm{z}^2)(1+\norm{x_0}^2)-\ip{x_0}{z}^2 > 0
\end{displaymath}
The previous inequality is therefore equivalent to:
\begin{multline*}
\left(1+\norm{z}^2\right)^2\left(1+\norm{x_0}^2\right)\ip{x}{x_0}^2
-\left(1+\norm{z}^2\right)\ip{x_0}{z}^2\ip{x}{x_0}^2\\
\geq \left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)^2\ip{x}{x_0}^2
+ \left(1+\norm{x_0}^2\right)\ip{x}{z}^2\ip{z}{x_0}^2\\
-2\left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)\ip{x}{x_0}\ip{x}{z}\ip{z}{x_0}
\end{multline*}
which simplifies to:
\begin{multline*}
2\left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)\ip{x}{x_0}\ip{x}{z}\ip{z}{x_0}\\
\geq \left(1+\norm{x_0}^2\right)\ip{x}{z}^2\ip{z}{x_0}^2
+ \left(1+\norm{z}^2\right)\ip{x_0}{z}^2\ip{x}{x_0}^2
\end{multline*}
This last inequality is not true in general. As soon as $x$, $x_0$ and $z$ span a space of dimension at least $2$, it is possible that, for example, $\ip{x}{x_0}$ and $\ip{x}{z}$ are positive while $\ip{z}{x_0}$ is negative. The left-hand side is then negative and cannot exceed the right-hand side, which is always nonnegative.
In the one-dimensional case, the inner product $\ip{x}{z}$ can be written as $\lambda xz$ for some positive $\lambda$. The last inequality then becomes:
\begin{displaymath}
2\geq \frac{\lambda z^2}{1+\lambda z^2} + \frac{\lambda x_0^2}{1+\lambda x_0^2}
\end{displaymath}
which is trivially true, since each term on the right-hand side is smaller than $1$ (a more direct proof for the one-dimensional case is of course possible).
In order to understand more precisely under which assumptions the above inequality could become true, it is convenient to look at it from the quadratic form perspective. Indeed this inequality can be rewritten as:
\begin{equation}\label{eq-inequality}
\tr x B x \geq 0
\end{equation}
with:
\begin{align*}
B = &\, \left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)\ip{x_0}{z} (x_0\tr z+z\tr x_0)\\
& -\ip{x_0}{z}^2\Big( \left(1+\norm{x_0}^2\right)z\tr z + \left(1+\norm{z}^2\right)x_0\tr x_0\Big)
\end{align*}
This quadratic form is degenerate: its kernel contains $x_0^{\bot}\cap z^\bot$, which is of dimension $k-2$ when $x_0$ and $z$ are linearly independent.
\paragraph{Case when $\norm{x_0}=\norm{z}=1$}
In this case, it suffices to study the quadratic form given by the matrix $B'$ with:
\begin{displaymath}
B' = 2\ip{x_0}{z}(x_0\tr z+z\tr x_0) -\ip{x_0}{z}^2(z\tr z + x_0\tr x_0)
\end{displaymath}
Writing $a = \ip{x_0}{z}$, the two non-zero eigenvalues are:
\begin{align*}
\lambda_1 & = -2a^3 + 2a^2 + 4a = -2a(a+1)(a-2)\\
\lambda_2 & = 2a^3 + 2a^2 - 4a = 2a(a-1)(a+2)
\end{align*}
which are respectively associated with the eigenvectors:
\begin{align*}
x_1 & = x_0+z\\
x_2 & = x_0 - z
\end{align*}
By the Cauchy-Schwarz inequality, $a\in(-1,1)$ (as long as $x_0\neq\pm z$), and for $a\neq 0$ the two eigenvalues have opposite signs on this interval.
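For a concrete instance (the numerical value is chosen purely for illustration), take $a = \ip{x_0}{z} = 1/2$: then $\lambda_1 = 9/4 > 0$ and $\lambda_2 = -5/4 < 0$, so choosing $x$ proportional to the eigenvector $x_2 = x_0 - z$ makes the quadratic form $\tr x B' x$ negative.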
Thus inequality \eqref{eq-inequality} does not hold for all $x$.
\paragraph{In expectation?}
If we assume prior knowledge of the distribution of $x$, writing $\Sigma$ for the variance-covariance matrix of $x$ and $\mu$ for its mean vector, then taking the expectation of the quadratic form in \eqref{eq-inequality} (here in the unit-norm case, with $B'$) we get:
\begin{displaymath}
\expt{\tr x B' x} = \trace(B'\Sigma) + \tr\mu B'\mu
\end{displaymath}
\nocite{shapley,inverse,recommendation,cook,shapleyor,subsetselection11,lse}
\bibliographystyle{plain}
\bibliography{notes}
\section*{Appendix}
\paragraph{Previous attempt at taming the submodularity}
The inequality only depends on the projection of $x$ on the plane spanned by $x_0$ and $z$. Writing
\begin{displaymath}
x = \lambda x_0 + \mu z + v,\quad\mathrm{with}\quad v \,\bot\, \mathrm{span}\{x_0, z\}
\end{displaymath}
the previous inequality becomes:
\begin{multline*}
2\left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)\ip{x_0}{z}
\left(\lambda\norm{x_0}^2+\mu\ip{x_0}{z}\right)
\left(\lambda\ip{x_0}{z}+\mu\norm{z}^2\right)\\
- \left(1+\norm{x_0}^2\right)\ip{x_0}{z}^2\left(\lambda\ip{x_0}{z}+\mu\norm{z}^2\right)^2\\
- \left(1+\norm{z}^2\right)\ip{x_0}{z}^2\left(\lambda\norm{x_0}^2+\mu\ip{x_0}{z}\right)^2\geq 0
\end{multline*}
By expanding and reordering the terms, we obtain a quadratic form in $\lambda$ and $\mu$:
\begin{multline*}
\lambda^2\ip{x_0}{z}^2\left[2\norm{x_0}^2+\norm{x_0}^4+\norm{x_0}^2\norm{z}^2
+\left(1+\norm{x_0}^2\right)\big(\norm{x_0}^2\norm{z}^2-\ip{x_0}{z}^2\big)\right]\\
+\mu^2\ip{x_0}{z}^2\left[2\norm{z}^2+\norm{z}^4+\norm{x_0}^2\norm{z}^2
+\left(1+\norm{z}^2\right)\big(\norm{x_0}^2\norm{z}^2-\ip{x_0}{z}^2\big)\right]\\
+2\lambda\mu\ip{x_0}{z}\Big[\norm{x_0}^2\norm{z}^2
\big(\norm{x_0}^2\norm{z}^2-\ip{x_0}{z}^2\big)\\
+\ip{x_0}{z}^2+\norm{x_0}^2\norm{z}^2
\big(1+\norm{x_0}^2+\norm{z}^2\big)\Big]\geq 0
\end{multline*}
This inequality holds for all $\lambda$ and $\mu$ if and only if the quadratic form is positive semidefinite. As its trace is nonnegative (by the Cauchy-Schwarz inequality, both diagonal coefficients are nonnegative), this is equivalent to the nonnegativity of its determinant.
\end{document}