\documentclass{article}
\usepackage[utf8]{inputenc}
\usepackage{amsmath,amsthm,amsfonts}

\newtheorem{lemma}{Lemma}

\newcommand{\var}{\mathop{\mathrm{Var}}}
\newcommand{\condexp}[2]{\mathop{\mathbb{E}}\left[#1|#2\right]}
\newcommand{\expt}[1]{\mathop{\mathbb{E}}\left[#1\right]}
\newcommand{\norm}[1]{\lVert#1\rVert}
\newcommand{\tr}[1]{#1^*}
\newcommand{\mse}{\mathop{\mathrm{MSE}}}

\begin{document}

\section{Understanding the recommender system}

\subsection{General problem}

We have a database $D_n$ consisting of $n$ pairs $(x_i,y_i)_{1\leq i\leq n}$, where $x_i$ is a vector of explanatory variables (a single number in the bivariate case) and $y_i$ is the variable of interest. From the database we want to build a regression function $f_n$ that predicts the variable of interest $y$ from a new vector of explanatory variables $x$. The regression error is measured by the mean squared error (MSE):
\begin{displaymath}
  \mathrm{MSE}(f_n) = \expt{\big(f_n(x)-y\big)^2}
\end{displaymath}
The general goal is to understand how the size of the database impacts the MSE of the derived regression function.

\subsection{From the bivariate normal case to linear regression}

If $(X,Y)$ is drawn from a bivariate normal distribution, then one can write:
\begin{displaymath}
  Y = \condexp{Y}{X} + \big(Y-\condexp{Y}{X}\big)
\end{displaymath}
In the bivariate normal case, $\condexp{Y}{X}$ is an affine function of $X$ and the residual $Y-\condexp{Y}{X}$ is a centered normal random variable independent of $X$, which motivates the linear model of the next subsection.

\subsection{Linear regression}

We assume a linear model:
\begin{displaymath}
  y_i = \beta\cdot x_i + \varepsilon_i
\end{displaymath}
where $\varepsilon_i$ is a normal random variable uncorrelated with $x_i$. We also assume that $\var(\varepsilon_i)$ is constant across observations (homoscedasticity) and denote the common value by $\sigma^2$. From the database we compute the least-squares estimator of $\beta$:
\begin{displaymath}
  \hat{\beta}_n = (\tr XX)^{-1}\tr XY
\end{displaymath}
where $X$ is the $n\times k$ matrix ($k$ is the number of explanatory variables) whose $i$-th row is $\tr x_i$, and $Y$ is the column vector $(y_1,\ldots,y_n)$. The regression function is simply the inner product of $\hat{\beta}_n$ and the new vector of explanatory variables: $f_n(x)=\hat\beta_n\cdot x$. From there we can compute the MSE for a given query point $x$:
\begin{displaymath}
  \mse(D_n) = \expt{\left(\beta\cdot x + \varepsilon - \hat\beta_n\cdot x\right)^2}
  = \tr x\expt{(\hat\beta_n-\beta)\tr{(\hat\beta_n-\beta)}} x + \expt{\varepsilon^2}
\end{displaymath}
where the cross term vanishes because the new noise $\varepsilon$ is centered and independent of both $x$ and the database used to compute $\hat\beta_n$. The variance-covariance matrix of $\hat\beta_n$ is equal to $\sigma^2(\tr XX)^{-1}$. Finally we get:
\begin{displaymath}
  \mse(D_n) = \sigma^2\tr x(\tr XX)^{-1}x + \sigma^2
\end{displaymath}

\subsubsection*{Monotonicity}

We first want to study the impact of adding one observation to the database. First, notice that:
\begin{displaymath}
  \tr XX = \sum_{i=1}^n x_i \tr x_i
\end{displaymath}
Write $A = \tr X X$. Adding an observation $x_0$ to the database then changes $A$ into $A+x_0\tr x_0$. The following derivation makes extensive use of the rank-one update (Sherman-Morrison) formula found in \cite{inverse}, which can be proven by direct verification: for any invertible matrix $A$ and any vectors $x$, $y$ such that $1+\tr y A^{-1}x\neq 0$,
\begin{equation}\label{eq:inverse}
  (A+x\tr y)^{-1} = A^{-1} - \frac{A^{-1}x\tr yA^{-1}}{1+\tr y A^{-1}x}
\end{equation}
The matrix $A=\tr XX$ is positive semidefinite; since it is assumed invertible, it is in fact positive definite, and so is $A^{-1}$.
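For completeness (this step is not used in the sequel), here is one way to carry out the direct verification of \eqref{eq:inverse}: multiply the right-hand side by $A+x\tr y$ and use the fact that $\tr yA^{-1}x$ is a scalar,
\begin{displaymath}
  \begin{split}
    (A+x\tr y)\left(A^{-1} - \frac{A^{-1}x\tr yA^{-1}}{1+\tr yA^{-1}x}\right)
    & = I + x\tr yA^{-1} - \frac{\big(1+\tr yA^{-1}x\big)\,x\tr yA^{-1}}{1+\tr yA^{-1}x}\\
    & = I
  \end{split}
\end{displaymath}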
We will denote by $x\cdot y$ the inner product defined by $A^{-1}$, that is:
\begin{displaymath}
  x\cdot y = \tr x A^{-1} y = \tr y A^{-1} x
\end{displaymath}
Using \eqref{eq:inverse} we get:
\begin{displaymath}
  \begin{split}
    \tr x (A + x_0\tr x_0)^{-1} x & = \tr x A^{-1} x - \frac{\tr x A^{-1}x_0\tr x_0A^{-1} x}{1+\tr x_0 A^{-1}x_0 }\\
    & = \tr x A^{-1} x - \frac{(x\cdot x_0)^2}{1+\norm{x_0}^2}
  \end{split}
\end{displaymath}
Thus:
\begin{equation}\label{eq:decrease}
  \mse(D_n\cup\{x_0\}) = \mse(D_n) - \frac{\sigma^2(x\cdot x_0)^2}{1+\norm{x_0}^2}
\end{equation}
\emph{Adding one observation to the database never increases the MSE}, and strictly decreases it unless $x\cdot x_0=0$.

\subsubsection*{Submodularity}

Let $D_m$ be a database of size $m$ containing $D_n$: $D_m$ is obtained from $D_n$ by adding some observations. We would like to show that adding one observation to $D_m$ yields a smaller decrease in MSE than adding the same observation to $D_n$:
\begin{displaymath}
  \mse(D_m)-\mse(D_m\cup\{x_0\})\leq \mse(D_n)-\mse(D_n\cup\{x_0\})
\end{displaymath}
First, remark that it suffices to prove this property when $D_n$ and $D_m$ differ by only one observation. Indeed, the one-observation case is a special case of the general property; conversely, writing $D_m$ as the end of a chain of databases, each obtained from the previous one by adding a single observation, and applying the one-observation property along the chain yields the general property.
Using \eqref{eq:decrease}, the decrease of MSE when adding $x_0$ to the database is:
\begin{displaymath}
  \frac{\sigma^2(\tr x A^{-1} x_0)^2}{1+\tr x_0 A^{-1} x_0}
\end{displaymath}
If we denote by $y$ the additional observation present in $D_m$ and not in $D_n$, then we would like to prove that:
\begin{displaymath}
  \frac{\sigma^2(\tr x A^{-1} x_0)^2}{1+\tr x_0 A^{-1} x_0}
  \geq \frac{\sigma^2\left(\tr x (A+y\tr y)^{-1} x_0\right)^2}{1+\tr x_0 (A+y\tr y)^{-1} x_0}
\end{displaymath}
Using the same notations as before, this is equivalent to:
\begin{displaymath}
  \frac{(x\cdot x_0)^2}{1+\norm{x_0}^2}
  \geq \frac{\left(\left(1+\norm{y}^2\right)(x\cdot x_0)-(x\cdot y)(y\cdot x_0)\right)^2}
  {\left(1+\norm{y}^2\right)\big((1+\norm{y}^2)(1+\norm{x_0}^2)-(x_0\cdot y)^2\big)}
\end{displaymath}
By the Cauchy-Schwarz inequality, $(x_0\cdot y)^2\leq\norm{x_0}^2\norm{y}^2$, hence:
\begin{displaymath}
  (1+\norm{y}^2)(1+\norm{x_0}^2)-(x_0\cdot y)^2 \geq 1+\norm{y}^2+\norm{x_0}^2 > 0
\end{displaymath}
so cross-multiplying by this quantity preserves the equivalence, and the previous inequality is successively equivalent to:
\begin{multline*}
  \left(1+\norm{y}^2\right)^2\left(1+\norm{x_0}^2\right)(x\cdot x_0)^2
  -\left(1+\norm{y}^2\right)(x_0\cdot y)^2(x\cdot x_0)^2\\
  \geq \left(1+\norm{x_0}^2\right)\left(1+\norm{y}^2\right)^2(x\cdot x_0)^2
  + \left(1+\norm{x_0}^2\right)(x\cdot y)^2(y\cdot x_0)^2\\
  -2\left(1+\norm{x_0}^2\right)\left(1+\norm{y}^2\right)(x\cdot x_0)(x\cdot y)(y\cdot x_0)
\end{multline*}
\begin{multline*}
  2\left(1+\norm{x_0}^2\right)\left(1+\norm{y}^2\right)(x\cdot x_0)(x\cdot y)(y\cdot x_0)\\
  \geq \left(1+\norm{x_0}^2\right)(x\cdot y)^2(y\cdot x_0)^2
  + \left(1+\norm{y}^2\right)(x_0\cdot y)^2(x\cdot x_0)^2
\end{multline*}
This last inequality is not true in general. As soon as the explanatory variables live in a space of dimension at least $2$, one can choose $x$, $y$ and $x_0$ such that, for example, $(x\cdot x_0)$ and $(x\cdot y)$ are positive while $(y\cdot x_0)$ is negative. The left-hand side of the inequality is then negative and cannot be greater than the right-hand side, which is nonnegative.
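For concreteness, here is one possible numerical illustration (the specific values are just a convenient choice; any configuration with the sign pattern above would do). Take $k=2$ and a database $D_n$ for which $A=\tr XX$ is the $2\times 2$ identity matrix (for instance the two observations $(1,0)$ and $(0,1)$), so that $x\cdot y$ is the standard inner product. With the query point $x=(1,0)$, the added observation $x_0=(1,-2)$ and the extra observation $y=(1,1)$ in $D_m=D_n\cup\{y\}$, formula \eqref{eq:decrease}, applied to $D_n$ and to $D_m$, gives:
\begin{displaymath}
  \mse(D_n)-\mse(D_n\cup\{x_0\}) = \frac{\sigma^2}{6},
  \qquad
  \mse(D_m)-\mse(D_m\cup\{x_0\}) = \frac{16\sigma^2}{51} > \frac{\sigma^2}{6}
\end{displaymath}
so adding $x_0$ to the larger database decreases the MSE by more, and the desired inequality fails.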
In the one-dimensional case, the inner product $(x\cdot y)$ can be written as $\lambda xy$ for some positive $\lambda$. If $x$, $y$ or $x_0$ is zero, both sides of the last inequality vanish and there is nothing to prove; otherwise, after dividing by the positive common factor $\lambda^3x^2y^2x_0^2(1+\lambda y^2)(1+\lambda x_0^2)$, the last inequality becomes:
\begin{displaymath}
  2\geq \frac{\lambda y^2}{1+\lambda y^2} + \frac{\lambda x_0^2}{1+\lambda x_0^2}
\end{displaymath}
which is trivially true, since each of the two fractions is smaller than $1$ (a more direct proof for the one-dimensional case is of course possible). Hence the diminishing-returns property holds in the one-dimensional case, but not in general.

\nocite{shapley,inverse,recommendation,cook,shapleyor,subsetselection11}
\bibliographystyle{plain}
\bibliography{notes}

\end{document}