\documentclass{article}
\usepackage[utf8]{inputenc}
\usepackage{amsmath,amsthm,amsfonts}
\usepackage{comment}
\newtheorem{lemma}{Lemma}
\newcommand{\var}{\mathop{\mathrm{Var}}}
\newcommand{\condexp}[2]{\mathop{\mathbb{E}}\left[#1|#2\right]}
\newcommand{\expt}[1]{\mathop{\mathbb{E}}\left[#1\right]}
\newcommand{\norm}[1]{\lVert#1\rVert}
\newcommand{\tr}[1]{#1^*}
\newcommand{\ip}[2]{\langle #1, #2 \rangle}
\newcommand{\mse}{\mathop{\mathrm{MSE}}}
\newcommand{\trace}{\mathop{\mathrm{tr}}}
\begin{document}
\section{Understanding the recommender system}
\subsection{General problem}
We already have a database $D_n$ of $n$ users. For each user $i$ we have a vector $x_i$ of $k$ explanatory variables (features). The problem is the following: we are about to start an experiment in which, for some of the users in the database, a variable of interest $y_i$ will be revealed (for example through a medical survey, or a website buying data from another website). From the pairs $(x_i, y_i)$ acquired through the experiment, we will compute a regression function $f_n$ which allows us to predict, for a new $x$, its associated $y$. The accuracy of this prediction is measured by the Mean Squared Error:
\begin{displaymath}
\mse(f_n) = \expt{\big(f_n(x)-y\big)^2}
\end{displaymath}
We would like to understand the impact of the number of users who take part in the experiment. In particular, how much does adding a user to the experiment impact the MSE?
\subsection{From the bivariate normal case to linear regression}
Suppose $(X,Y)$ is drawn from a bivariate normal distribution with mean vector $\mu$ and covariance matrix $\Sigma$. Then one can write:
\begin{displaymath}
Y = \condexp{Y}{X} + \big(Y-\condexp{Y}{X}\big)
\end{displaymath}
In this particular case, $\condexp{Y}{X}$ is a linear function of $X$:
\begin{displaymath}
\condexp{Y}{X} = \alpha X + \beta
\end{displaymath}
where $\alpha$ and $\beta$ can be expressed as functions of $\mu$ and $\Sigma$ (in terms of their entries, $\alpha = \Sigma_{XY}/\Sigma_{XX}$ and $\beta = \mu_Y - \alpha\mu_X$). Writing $\varepsilon = Y-\condexp{Y}{X}$, it is easy to see that $\expt{X\varepsilon}=0$. Furthermore $\varepsilon$ is also normally distributed. Under these assumptions, it can be proven that the least squares estimator for $(\alpha,\beta)$ is optimal (it reaches the Cramér-Rao bound).
\subsection{Linear regression}
We assume a linear model:
\begin{displaymath}
y_i = \beta\cdot x_i + \varepsilon_i
\end{displaymath}
where $\varepsilon_i$ is a normal random variable uncorrelated with $x_i$. We also assume that $\var(\varepsilon_i)$ is constant (homoscedasticity); $\sigma^2$ will denote the common value. From the database we compute the least squares estimator of $\beta$:
\begin{displaymath}
\hat{\beta}_n = (\tr XX)^{-1}\tr XY
\end{displaymath}
where $X$ is the $n\times k$ matrix ($k$ is the number of explanatory variables) whose $i$-th row is $\tr x_i$, and $Y$ is $(y_1,\ldots,y_n)$. The regression function is simply the inner product of $\hat{\beta}_n$ and the new vector of explanatory variables $x$. From there we can compute the MSE:
\begin{displaymath}
\mse(D_n) =\expt{\left(\beta\cdot x + \varepsilon - \hat\beta_n\cdot x\right)^2}
= \tr x\expt{(\hat\beta_n-\beta)\tr{(\hat\beta_n-\beta)}} x + \expt{\varepsilon^2}
\end{displaymath}
where we used that the noise $\varepsilon$ of the new observation is uncorrelated with $x$ and with the observations in the database, so that the cross term vanishes. The variance-covariance matrix of $\hat\beta_n$ is equal to $\sigma^2(\tr XX)^{-1}$.
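For completeness, this identity can be checked directly. Treating the design matrix $X$ as fixed and writing $e = (\varepsilon_1,\ldots,\varepsilon_n)$ for the noise vector of the database (a notation used only in this remark), we have $\hat\beta_n - \beta = (\tr XX)^{-1}\tr X e$, so that
\begin{displaymath}
\expt{(\hat\beta_n-\beta)\tr{(\hat\beta_n-\beta)}}
= (\tr XX)^{-1}\tr X\,\expt{e\tr e}\,X(\tr XX)^{-1}
= \sigma^2(\tr XX)^{-1}
\end{displaymath}
where the last equality uses $\expt{e\tr e} = \sigma^2 I$, i.e.\ that the noise terms of different observations are uncorrelated and homoscedastic, as is implicit in the model.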
Finally we get:
\begin{displaymath}
\mse(D_n) = \sigma^2\tr x(\tr XX)^{-1}x + \sigma^2
\end{displaymath}
\subsubsection*{Monotonicity}
We first want to study the impact of adding one observation to the database. First, notice that:
\begin{displaymath}
\tr XX = \sum_{i=1}^n x_i \tr x_i
\end{displaymath}
Let us write $A = \tr XX$. Then, adding $x_0$ to the database will change $A$ into $A+x_0\tr x_0$. The following derivation makes extensive use of the Sherman-Morrison formula \cite{sm} (which can be proved by direct verification). For any invertible matrix $A$ and vectors $x$, $y$ such that $1+\tr y A^{-1}x \neq 0$:
\begin{equation}\label{eq:inverse}
(A+x\tr y)^{-1} = A^{-1} - \frac{A^{-1}x\tr yA^{-1}}{1+\tr y A^{-1}x}
\end{equation}
$A = \tr XX$ is positive semidefinite, and since we assume it is invertible it is in fact positive definite; hence so is $A^{-1}$. We will denote by $\ip{x}{y}$ the scalar product defined by $A^{-1}$, that is:
\begin{displaymath}
\ip{x}{y} = \tr x A^{-1} y = \tr y A^{-1} x
\end{displaymath}
Using \eqref{eq:inverse} we get:
\begin{displaymath}
\begin{split}
\tr x (A + x_0\tr x_0)^{-1} x & = \tr x A^{-1} x - \frac{\tr x A^{-1}x_0\tr x_0A^{-1} x}{1+\tr x_0 A^{-1}x_0 }\\
& = \tr x A^{-1} x - \frac{\ip{x}{x_0}^2}{1+\norm{x_0}^2}
\end{split}
\end{displaymath}
Thus:
\begin{equation}\label{eq:decrease}
\mse(D_n\cup\{x_0\}) = \mse(D_n) - \frac{\sigma^2\ip{x}{x_0}^2}{1+\norm{x_0}^2}
\end{equation}
\emph{Adding one observation to the database never increases the MSE (and strictly decreases it whenever $\ip{x}{x_0}\neq 0$).}
\subsubsection*{Submodularity}
Let $D_m$ be a database of size $m$ containing $D_n$: $D_m$ is obtained from $D_n$ by adding some observations. We would like to show that adding one observation to $D_m$ yields a smaller decrease in MSE than adding the same observation to $D_n$:
\begin{displaymath}
\mse(D_m)-\mse(D_m\cup\{x_0\})\leq \mse(D_n)-\mse(D_n\cup\{x_0\})
\end{displaymath}
First, remark that it suffices to prove this property when $D_n$ and $D_m$ differ by only one observation (the two statements are in fact equivalent, since the one-observation case is a special case of the general one): applying the one-observation property repeatedly along a chain of intermediate databases yields the general statement, as spelled out below.
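To spell out this reduction, write $D_m = D_n\cup\{z_1,\ldots,z_{m-n}\}$ and, for $0\leq j\leq m-n$, let $E_j = D_n\cup\{z_1,\ldots,z_j\}$ (the $z_j$ and $E_j$ are notation introduced only for this remark), so that $E_0 = D_n$ and $E_{m-n} = D_m$. If the property holds for databases differing by one observation, then for every $0\leq j < m-n$, since $E_{j+1} = E_j\cup\{z_{j+1}\}$,
\begin{displaymath}
\mse(E_{j+1})-\mse(E_{j+1}\cup\{x_0\}) \leq \mse(E_{j})-\mse(E_{j}\cup\{x_0\})
\end{displaymath}
and chaining these inequalities from $j = m-n-1$ down to $j = 0$ gives exactly the desired comparison between $D_m$ and $D_n$.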
Using \eqref{eq:decrease}, the decrease of MSE when adding $x_0$ to the database is:
\begin{displaymath}
\frac{\sigma^2(\tr x A^{-1} x_0)^2}{1+\tr x_0 A^{-1} x_0}
\end{displaymath}
If we denote by $z$ the additional observation present in $D_m$ and not in $D_n$, then we would like to prove that:
\begin{displaymath}
\frac{\sigma^2(\tr x A^{-1} x_0)^2}{1+\tr x_0 A^{-1} x_0}
\geq \frac{\sigma^2\left(\tr x (A+z\tr z)^{-1} x_0\right)^2}{1+\tr x_0 (A+z\tr z)^{-1} x_0}
\end{displaymath}
Using the same notations as before, this is equivalent to:
\begin{displaymath}
\frac{\ip{x}{x_0}^2}{1+\norm{x_0}^2} \geq
\frac{\left(\left(1+\norm{z}^2\right)\ip{x}{x_0}-\ip{x}{z}\ip{z}{x_0}\right)^2}
{\left(1+\norm{z}^2\right)\big((1+\norm{z}^2)(1+\norm{x_0}^2)-\ip{x_0}{z}^2\big)}
\end{displaymath}
By the Cauchy-Schwarz inequality:
\begin{displaymath}
(1+\norm{z}^2)(1+\norm{x_0}^2)-\ip{x_0}{z}^2 > 0
\end{displaymath}
The previous inequality is therefore equivalent to:
\begin{multline*}
\left(1+\norm{z}^2\right)^2\left(1+\norm{x_0}^2\right)\ip{x}{x_0}^2
-\left(1+\norm{z}^2\right)\ip{x_0}{z}^2\ip{x}{x_0}^2\\
\geq \left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)^2\ip{x}{x_0}^2
+ \left(1+\norm{x_0}^2\right)\ip{x}{z}^2\ip{z}{x_0}^2\\
-2\left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)\ip{x}{x_0}\ip{x}{z}\ip{z}{x_0}
\end{multline*}
which simplifies to:
\begin{multline*}
2\left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)\ip{x}{x_0}\ip{x}{z}\ip{z}{x_0}\\
\geq \left(1+\norm{x_0}^2\right)\ip{x}{z}^2\ip{z}{x_0}^2
+ \left(1+\norm{z}^2\right)\ip{x_0}{z}^2\ip{x}{x_0}^2
\end{multline*}
This last inequality is not true in general. As soon as $x$, $x_0$ and $z$ span a space of dimension at least $2$, it is possible that, for example, $\ip{x}{x_0}$ and $\ip{x}{z}$ are positive while $\ip{z}{x_0}$ is negative. The left-hand side is then negative and cannot exceed the right-hand side, which is always nonnegative.
In the one-dimensional case, the inner product $\ip{x}{z}$ can be written as $\lambda xz$ for some positive $\lambda$. The last inequality then becomes:
\begin{displaymath}
2\geq \frac{\lambda z^2}{1+\lambda z^2} + \frac{\lambda x_0^2}{1+\lambda x_0^2}
\end{displaymath}
which is trivially true, since each term on the right-hand side is smaller than $1$ (a more direct proof for the one-dimensional case is of course possible).
In order to understand more precisely under which assumptions the above inequality could become true, it is convenient to look at it from the quadratic form perspective. Indeed this inequality can be rewritten as:
\begin{equation}\label{eq-inequality}
\tr x B x \geq 0
\end{equation}
with:
\begin{align*}
B = &\, \left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)\ip{x_0}{z} (x_0\tr z+z\tr x_0)\\
& -\ip{x_0}{z}^2\Big( \left(1+\norm{x_0}^2\right)z\tr z + \left(1+\norm{z}^2\right)x_0\tr x_0\Big)
\end{align*}
This quadratic form is degenerate: its kernel contains $x_0^{\bot}\cap z^\bot$, which is of dimension $k-2$ when $x_0$ and $z$ are linearly independent.
\paragraph{Case when $\norm{x_0}=\norm{z}=1$}
In this case, it suffices to study the quadratic form given by the matrix $B'$ with:
\begin{displaymath}
B' = 2\ip{x_0}{z}(x_0\tr z+z\tr x_0) -\ip{x_0}{z}^2(z\tr z + x_0\tr x_0)
\end{displaymath}
Writing $a = \ip{x_0}{z}$, the two non-zero eigenvalues are:
\begin{align*}
\lambda_1 & = -2a^3 + 2a^2 + 4a = -2a(a+1)(a-2)\\
\lambda_2 & = 2a^3 + 2a^2 - 4a = 2a(a-1)(a+2)
\end{align*}
which are respectively associated with the eigenvectors:
\begin{align*}
x_1 & = x_0+z\\
x_2 & = x_0 - z
\end{align*}
By the Cauchy-Schwarz inequality, $a\in(-1,1)$ (as long as $x_0\neq\pm z$), and for $a\neq 0$ the two eigenvalues have opposite signs on this interval.
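For a concrete instance (the numerical value is chosen purely for illustration), take $a = \ip{x_0}{z} = 1/2$: then $\lambda_1 = 9/4 > 0$ and $\lambda_2 = -5/4 < 0$, so choosing $x$ proportional to the eigenvector $x_2 = x_0 - z$ makes the quadratic form $\tr x B' x$ negative.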
Thus inequality \eqref{eq-inequality} does not hold for all $x$.
\paragraph{In expectation?}
If we assume prior knowledge of the distribution of $x$, writing $\Sigma$ for the variance-covariance matrix of $x$ and $\mu$ for its mean vector, then taking the expectation of the quadratic form in \eqref{eq-inequality} (here in the unit-norm case, with $B'$) we get:
\begin{displaymath}
\expt{\tr x B' x} = \trace(B'\Sigma) + \tr\mu B'\mu
\end{displaymath}
\nocite{shapley,inverse,recommendation,cook,shapleyor,subsetselection11,lse}
\bibliographystyle{plain}
\bibliography{notes}
\section*{Appendix}
\paragraph{Previous attempt at taming the submodularity}
The inequality only depends on the projection of $x$ on the plane spanned by $x_0$ and $z$. Writing
\begin{displaymath}
x = \lambda x_0 + \mu z + v,\quad\mathrm{with}\quad v \,\bot\, \mathrm{span}\{x_0, z\}
\end{displaymath}
the previous inequality becomes:
\begin{multline*}
2\left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)\ip{x_0}{z}
\left(\lambda\norm{x_0}^2+\mu\ip{x_0}{z}\right)
\left(\lambda\ip{x_0}{z}+\mu\norm{z}^2\right)\\
- \left(1+\norm{x_0}^2\right)\ip{x_0}{z}^2\left(\lambda\ip{x_0}{z}+\mu\norm{z}^2\right)^2\\
- \left(1+\norm{z}^2\right)\ip{x_0}{z}^2\left(\lambda\norm{x_0}^2+\mu\ip{x_0}{z}\right)^2\geq 0
\end{multline*}
By expanding and reordering the terms, we obtain a quadratic form in $\lambda$ and $\mu$:
\begin{multline*}
\lambda^2\ip{x_0}{z}^2\left[2\norm{x_0}^2+\norm{x_0}^4+\norm{x_0}^2\norm{z}^2
+\left(1+\norm{x_0}^2\right)\big(\norm{x_0}^2\norm{z}^2-\ip{x_0}{z}^2\big)\right]\\
+\mu^2\ip{x_0}{z}^2\left[2\norm{z}^2+\norm{z}^4+\norm{x_0}^2\norm{z}^2
+\left(1+\norm{z}^2\right)\big(\norm{x_0}^2\norm{z}^2-\ip{x_0}{z}^2\big)\right]\\
+2\lambda\mu\ip{x_0}{z}\Big[\norm{x_0}^2\norm{z}^2
\big(\norm{x_0}^2\norm{z}^2-\ip{x_0}{z}^2\big)\\
+\ip{x_0}{z}^2+\norm{x_0}^2\norm{z}^2
\big(1+\norm{x_0}^2+\norm{z}^2\big)\Big]\geq 0
\end{multline*}
This inequality holds for all $\lambda$ and $\mu$ if and only if the quadratic form is positive semidefinite. As its trace is nonnegative (by the Cauchy-Schwarz inequality, both diagonal coefficients are nonnegative), this is equivalent to the nonnegativity of its determinant.
\end{document}