\documentclass{article}
\usepackage[utf8]{inputenc}
\usepackage{amsmath,amsthm,amsfonts}

\newtheorem{lemma}{Lemma}

\newcommand{\var}{\mathop{\mathrm{Var}}}
\newcommand{\condexp}[2]{\mathop{\mathbb{E}}\left[#1|#2\right]}
\newcommand{\expt}[1]{\mathop{\mathbb{E}}\left[#1\right]}
\newcommand{\norm}[1]{\lVert#1\rVert}
\newcommand{\tr}[1]{#1^*}
\newcommand{\mse}{\mathop{\mathrm{MSE}}}

\begin{document}

\section{Understanding the recommender system}

\subsection{General problem}

We have a database $D_n$ consisting of $n$ pairs $(x_i,y_i)_{1\leq i\leq n}$, where $x_i$ is a vector of explanatory variables (a single number in the bivariate case) and $y_i$ is the variable of interest. From the database we want to build a regression function $f_n$ that predicts the variable of interest $y$ from a new vector of explanatory variables $x$. The regression error is measured by the mean squared error (MSE):
\begin{displaymath}
  \mathrm{MSE}(f_n) = \expt{\big(f_n(x)-y\big)^2}
\end{displaymath}
The general goal is to understand how the size of the database impacts the MSE of the derived regression function.

\subsection{From the bivariate normal case to linear regression}

If $(X,Y)$ is drawn from a bivariate normal distribution, then one can write:
\begin{displaymath}
  Y = \condexp{Y}{X} + \big(Y-\condexp{Y}{X}\big)
\end{displaymath}
In the bivariate normal case, $\condexp{Y}{X}$ is an affine function of $X$ and the residual $Y-\condexp{Y}{X}$ is a centered normal random variable independent of $X$, which motivates the linear model of the next subsection.

\subsection{Linear regression}

We assume a linear model:
\begin{displaymath}
  y_i = \beta\cdot x_i + \varepsilon_i
\end{displaymath}
where $\varepsilon_i$ is a normal random variable uncorrelated with $x_i$. We also assume that $\var(\varepsilon_i)$ is constant across observations (homoscedasticity) and denote the common value by $\sigma^2$. From the database we compute the least-squares estimator of $\beta$:
\begin{displaymath}
  \hat{\beta}_n = (\tr XX)^{-1}\tr XY
\end{displaymath}
where $X$ is the $n\times k$ matrix ($k$ is the number of explanatory variables) whose $i$-th row is $\tr x_i$, and $Y$ is the column vector $(y_1,\ldots,y_n)$. The regression function is simply the inner product of $\hat{\beta}_n$ and the new vector of explanatory variables: $f_n(x)=\hat\beta_n\cdot x$. From there we can compute the MSE for a given query point $x$:
\begin{displaymath}
  \mse(D_n) = \expt{\left(\beta\cdot x + \varepsilon - \hat\beta_n\cdot x\right)^2}
  = \tr x\expt{(\hat\beta_n-\beta)\tr{(\hat\beta_n-\beta)}} x + \expt{\varepsilon^2}
\end{displaymath}
where the cross term vanishes because the new noise $\varepsilon$ is centered and independent of both $x$ and the database used to compute $\hat\beta_n$. The variance-covariance matrix of $\hat\beta_n$ is equal to $\sigma^2(\tr XX)^{-1}$. Finally we get:
\begin{displaymath}
  \mse(D_n) = \sigma^2\tr x(\tr XX)^{-1}x + \sigma^2
\end{displaymath}

\subsubsection*{Monotonicity}

We first want to study the impact of adding one observation to the database. First, notice that:
\begin{displaymath}
  \tr XX = \sum_{i=1}^n x_i \tr x_i
\end{displaymath}
Write $A = \tr X X$. Adding an observation $x_0$ to the database then changes $A$ into $A+x_0\tr x_0$. The following derivation makes extensive use of the rank-one update (Sherman-Morrison) formula found in \cite{inverse}, which can be proven by direct verification: for any invertible matrix $A$ and any vectors $x$, $y$ such that $1+\tr y A^{-1}x\neq 0$,
\begin{equation}\label{eq:inverse}
  (A+x\tr y)^{-1} = A^{-1} - \frac{A^{-1}x\tr yA^{-1}}{1+\tr y A^{-1}x}
\end{equation}
The matrix $A=\tr XX$ is positive semidefinite; since it is assumed invertible, it is in fact positive definite, and so is $A^{-1}$.
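For completeness (this step is not used in the sequel), here is one way to carry out the direct verification of \eqref{eq:inverse}: multiply the right-hand side by $A+x\tr y$ and use the fact that $\tr yA^{-1}x$ is a scalar,
\begin{displaymath}
  \begin{split}
    (A+x\tr y)\left(A^{-1} - \frac{A^{-1}x\tr yA^{-1}}{1+\tr yA^{-1}x}\right)
    & = I + x\tr yA^{-1} - \frac{\big(1+\tr yA^{-1}x\big)\,x\tr yA^{-1}}{1+\tr yA^{-1}x}\\
    & = I
  \end{split}
\end{displaymath}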
We will denote by $x\cdot y$ the inner product defined by $A^{-1}$, that is:
\begin{displaymath}
  x\cdot y = \tr x A^{-1} y = \tr y A^{-1} x
\end{displaymath}
Using \eqref{eq:inverse} we get:
\begin{displaymath}
  \begin{split}
    \tr x (A + x_0\tr x_0)^{-1} x & = \tr x A^{-1} x - \frac{\tr x A^{-1}x_0\tr x_0A^{-1} x}{1+\tr x_0 A^{-1}x_0 }\\
    & = \tr x A^{-1} x - \frac{(x\cdot x_0)^2}{1+\norm{x_0}^2}
  \end{split}
\end{displaymath}
Thus:
\begin{equation}\label{eq:decrease}
  \mse(D_n\cup\{x_0\}) = \mse(D_n) - \frac{\sigma^2(x\cdot x_0)^2}{1+\norm{x_0}^2}
\end{equation}
\emph{Adding one observation to the database never increases the MSE}, and strictly decreases it unless $x\cdot x_0=0$.

\subsubsection*{Submodularity}

Let $D_m$ be a database of size $m$ containing $D_n$: $D_m$ is obtained from $D_n$ by adding some observations. We would like to show that adding one observation to $D_m$ yields a smaller decrease in MSE than adding the same observation to $D_n$:
\begin{displaymath}
  \mse(D_m)-\mse(D_m\cup\{x_0\})\leq \mse(D_n)-\mse(D_n\cup\{x_0\})
\end{displaymath}
First, remark that it suffices to prove this property when $D_n$ and $D_m$ differ by only one observation. Indeed, the one-observation case is a special case of the general property; conversely, writing $D_m$ as the end of a chain of databases, each obtained from the previous one by adding a single observation, and applying the one-observation property along the chain yields the general property.
Using \eqref{eq:decrease}, the decrease of MSE when adding $x_0$ to the database is:
\begin{displaymath}
  \frac{\sigma^2(\tr x A^{-1} x_0)^2}{1+\tr x_0 A^{-1} x_0}
\end{displaymath}
If we denote by $y$ the additional observation present in $D_m$ and not in $D_n$, then we would like to prove that:
\begin{displaymath}
  \frac{\sigma^2(\tr x A^{-1} x_0)^2}{1+\tr x_0 A^{-1} x_0}
  \geq \frac{\sigma^2\left(\tr x (A+y\tr y)^{-1} x_0\right)^2}{1+\tr x_0 (A+y\tr y)^{-1} x_0}
\end{displaymath}
Using the same notations as before, this is equivalent to:
\begin{displaymath}
  \frac{(x\cdot x_0)^2}{1+\norm{x_0}^2}
  \geq \frac{\left(\left(1+\norm{y}^2\right)(x\cdot x_0)-(x\cdot y)(y\cdot x_0)\right)^2}
  {\left(1+\norm{y}^2\right)\big((1+\norm{y}^2)(1+\norm{x_0}^2)-(x_0\cdot y)^2\big)}
\end{displaymath}
By the Cauchy-Schwarz inequality, $(x_0\cdot y)^2\leq\norm{x_0}^2\norm{y}^2$, hence:
\begin{displaymath}
  (1+\norm{y}^2)(1+\norm{x_0}^2)-(x_0\cdot y)^2 \geq 1+\norm{y}^2+\norm{x_0}^2 > 0
\end{displaymath}
so cross-multiplying by this quantity preserves the equivalence, and the previous inequality is successively equivalent to:
\begin{multline*}
  \left(1+\norm{y}^2\right)^2\left(1+\norm{x_0}^2\right)(x\cdot x_0)^2
  -\left(1+\norm{y}^2\right)(x_0\cdot y)^2(x\cdot x_0)^2\\
  \geq \left(1+\norm{x_0}^2\right)\left(1+\norm{y}^2\right)^2(x\cdot x_0)^2
  + \left(1+\norm{x_0}^2\right)(x\cdot y)^2(y\cdot x_0)^2\\
  -2\left(1+\norm{x_0}^2\right)\left(1+\norm{y}^2\right)(x\cdot x_0)(x\cdot y)(y\cdot x_0)
\end{multline*}
\begin{multline*}
  2\left(1+\norm{x_0}^2\right)\left(1+\norm{y}^2\right)(x\cdot x_0)(x\cdot y)(y\cdot x_0)\\
  \geq \left(1+\norm{x_0}^2\right)(x\cdot y)^2(y\cdot x_0)^2
  + \left(1+\norm{y}^2\right)(x_0\cdot y)^2(x\cdot x_0)^2
\end{multline*}
This last inequality is not true in general. As soon as the explanatory variables live in a space of dimension at least $2$, one can choose $x$, $y$ and $x_0$ such that, for example, $(x\cdot x_0)$ and $(x\cdot y)$ are positive while $(y\cdot x_0)$ is negative. The left-hand side of the inequality is then negative and cannot be greater than the right-hand side, which is nonnegative.
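For concreteness, here is one possible numerical illustration (the specific values are just a convenient choice; any configuration with the sign pattern above would do). Take $k=2$ and a database $D_n$ for which $A=\tr XX$ is the $2\times 2$ identity matrix (for instance the two observations $(1,0)$ and $(0,1)$), so that $x\cdot y$ is the standard inner product. With the query point $x=(1,0)$, the added observation $x_0=(1,-2)$ and the extra observation $y=(1,1)$ in $D_m=D_n\cup\{y\}$, formula \eqref{eq:decrease}, applied to $D_n$ and to $D_m$, gives:
\begin{displaymath}
  \mse(D_n)-\mse(D_n\cup\{x_0\}) = \frac{\sigma^2}{6},
  \qquad
  \mse(D_m)-\mse(D_m\cup\{x_0\}) = \frac{16\sigma^2}{51} > \frac{\sigma^2}{6}
\end{displaymath}
so adding $x_0$ to the larger database decreases the MSE by more, and the desired inequality fails.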
In the one-dimensional case, the inner product $(x\cdot y)$ can be written as $\lambda xy$ for some positive $\lambda$. If $x$, $y$ or $x_0$ is zero, both sides of the last inequality vanish and there is nothing to prove; otherwise, after dividing by the positive common factor $\lambda^3x^2y^2x_0^2(1+\lambda y^2)(1+\lambda x_0^2)$, the last inequality becomes:
\begin{displaymath}
  2\geq \frac{\lambda y^2}{1+\lambda y^2} + \frac{\lambda x_0^2}{1+\lambda x_0^2}
\end{displaymath}
which is trivially true, since each of the two fractions is smaller than $1$ (a more direct proof for the one-dimensional case is of course possible). Hence the diminishing-returns property holds in the one-dimensional case, but not in general.

\nocite{shapley,inverse,recommendation,cook,shapleyor,subsetselection11}
\bibliographystyle{plain}
\bibliography{notes}

\end{document}