Diffstat (limited to 'notes.tex')
| -rw-r--r-- | notes.tex | 199 |
1 files changed, 199 insertions, 0 deletions
diff --git a/notes.tex b/notes.tex
new file mode 100644
index 0000000..f75d2b7
--- /dev/null
+++ b/notes.tex
@@ -0,0 +1,199 @@
\documentclass{article}
\usepackage[utf8]{inputenc}
\usepackage{amsmath,amsthm,amsfonts}
\newtheorem{lemma}{Lemma}
\newcommand{\var}{\mathop{\mathrm{Var}}}
\newcommand{\condexp}[2]{\mathop{\mathbb{E}}\left[#1|#2\right]}
\newcommand{\expt}[1]{\mathop{\mathbb{E}}\left[#1\right]}
\newcommand{\norm}[1]{\lVert#1\rVert}
\newcommand{\tr}[1]{#1^*}
\newcommand{\mse}{\mathop{\mathrm{MSE}}}
\begin{document}

\section{Understanding the recommender system}

\subsection{General problem}

We have a database $D_n$ consisting of $n$ pairs $(x_i,y_i)_{1\leq i\leq n}$,
where $x_i$ is a vector of explanatory variables (possibly a single number in
the bivariate case) and $y_i$ is the variable of interest.

From the database we want to compute a regression function $f_n$ that predicts
the variable of interest $y$ from a new vector of explanatory variables $x$.

The cost of the regression error is measured by the mean squared error (MSE):
\begin{displaymath}
  \mathrm{MSE}(f_n) = \expt{\big(f_n(x)-y\big)^2}
\end{displaymath}

The general goal is to understand how the size of the database affects the MSE
of the derived regression function.

\subsection{From the bivariate normal case to linear regression}
If $(X,Y)$ is drawn from a bivariate normal distribution, then one can write:
\begin{displaymath}
  Y = \condexp{Y}{X} + \big(Y-\condexp{Y}{X}\big)
\end{displaymath}
In that case $\condexp{Y}{X}$ is an affine function of $X$ and the residual
$Y-\condexp{Y}{X}$ is Gaussian and independent of $X$, which motivates the
linear model of the next subsection.

\subsection{Linear regression}

We assume a linear model:
\begin{displaymath}
  y_i = \beta\cdot x_i + \varepsilon_i
\end{displaymath}
where $\varepsilon_i$ is a normal random variable uncorrelated with $x_i$. We
also assume that $\var(\varepsilon_i)$ is constant (homoscedasticity);
$\sigma^2$ will denote the common value.

From the database we compute the least-squares estimator of $\beta$:
\begin{displaymath}
  \hat{\beta}_n = (\tr XX)^{-1}\tr XY
\end{displaymath}
where $X$ is the $n\times k$ matrix ($k$ being the number of explanatory
variables) whose $i$-th row is $\tr x_i$, and $Y$ is the column vector
$(y_1,\ldots,y_n)$.

The regression function is simply the inner product of $\hat{\beta}_n$ with
the new vector of explanatory variables $x$.

From there we can compute the MSE:
\begin{displaymath}
  \mse(D_n)
  =\expt{\left(\beta\cdot x + \varepsilon - \hat\beta_n\cdot x\right)^2}
  = \tr x\expt{(\hat\beta_n-\beta)\tr{(\hat\beta_n-\beta)}} x + \expt{\varepsilon^2}
\end{displaymath}
where we used that the new noise term $\varepsilon$ has mean zero and is
independent of $\hat\beta_n$, so the cross term vanishes. The
variance-covariance matrix of $\hat\beta_n$ is equal to
$\sigma^2(\tr XX)^{-1}$. Finally we get:
\begin{displaymath}
  \mse(D_n)
  = \sigma^2\tr x(\tr XX)^{-1}x + \sigma^2
\end{displaymath}

\subsubsection*{Monotonicity}

We first want to study the impact of adding one observation to the database.
First, notice that:
\begin{displaymath}
  \tr XX = \sum_{i=1}^n x_i \tr x_i
\end{displaymath}
Let us write $A = \tr XX$. Then adding an observation $x_0$ to the database
changes $A$ into $A+x_0\tr x_0$.

The following derivation makes extensive use of the Sherman--Morrison formula
found in \cite{inverse} (which can be proven by direct verification). For any
invertible matrix $A$ such that $1+\tr y A^{-1}x\neq 0$:
\begin{equation}\label{eq:inverse}
(A+x\tr y)^{-1} = A^{-1} - \frac{A^{-1}x\tr yA^{-1}}{1+\tr y A^{-1}x}
\end{equation}

Since $A=\tr XX$ is positive definite whenever $X$ has full column rank (which
we assume, so that $\hat\beta_n$ is well defined), its inverse $A^{-1}$ is
positive definite as well.
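As an illustrative sanity check, \eqref{eq:inverse} can be verified numerically
in the symmetric case $A=\tr XX$ that is used below; the following sketch
assumes Python with NumPy and purely synthetic data:
\begin{verbatim}
import numpy as np

# Sanity check of the rank-one inverse update for A = X^T X.
# Synthetic, illustrative data only.
rng = np.random.default_rng(0)
n, k = 20, 3
X = rng.normal(size=(n, k))      # design matrix of the database D_n
x0 = rng.normal(size=k)          # observation to be added

A_inv = np.linalg.inv(X.T @ X)   # A^{-1}

# Left-hand side: recompute the inverse after appending x0 to X.
X_new = np.vstack([X, x0])
lhs = np.linalg.inv(X_new.T @ X_new)

# Right-hand side: rank-one update of A^{-1}.
rhs = A_inv - np.outer(A_inv @ x0, x0 @ A_inv) / (1.0 + x0 @ A_inv @ x0)

assert np.allclose(lhs, rhs)
\end{verbatim}
The same rank-one update applied to the quadratic form $\tr x A^{-1}x$ drives
the monotonicity and submodularity computations below.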
We will denote by $x\cdot y$ the scalar product defined by $A^{-1}$, that is:
\begin{displaymath}
  x\cdot y = \tr x A^{-1} y = \tr y A^{-1} x
\end{displaymath}
and $\norm{x}^2 = x\cdot x$ will denote the associated squared norm.

Using \eqref{eq:inverse} we get:
\begin{displaymath}
\begin{split}
  \tr x (A + x_0\tr x_0)^{-1} x & = \tr x A^{-1} x
  - \frac{\tr x A^{-1}x_0\,\tr x_0A^{-1} x}{1+\tr x_0 A^{-1}x_0}\\
& = \tr x A^{-1} x - \frac{(x\cdot x_0)^2}{1+\norm{x_0}^2}
\end{split}
\end{displaymath}

Thus:
\begin{equation}\label{eq:decrease}
\mse(D_n\cup\{x_0\}) = \mse(D_n) - \frac{\sigma^2(x\cdot x_0)^2}{1+\norm{x_0}^2}
\end{equation}

\emph{Adding one observation to the database never increases the MSE.}

\subsubsection*{Submodularity}

Let $D_m$ be a database of size $m$ containing $D_n$: $D_m$ is obtained from
$D_n$ by adding some observations. We would like to show that adding one
observation to $D_m$ yields a smaller decrease in MSE than adding the same
observation to $D_n$:
\begin{displaymath}
  \mse(D_m)-\mse(D_m\cup\{x_0\})\leq \mse(D_n)-\mse(D_n\cup\{x_0\})
\end{displaymath}

First, remark that it suffices to prove this property when $D_n$ and $D_m$
differ by a single observation: the general case then follows by applying the
single-observation property repeatedly along a chain of databases leading from
$D_n$ to $D_m$. (The converse direction is immediate, since the
single-observation case is a special case of the general property.)

Using \eqref{eq:decrease}, the decrease of the MSE when adding $x_0$ to the
database is:
\begin{displaymath}
  \frac{\sigma^2(\tr x A^{-1} x_0)^2}{1+\tr x_0 A^{-1} x_0}
\end{displaymath}

If we denote by $y$ the additional observation present in $D_m$ but not in
$D_n$, then we would like to prove that:
\begin{displaymath}
  \frac{\sigma^2(\tr x A^{-1} x_0)^2}{1+\tr x_0 A^{-1} x_0}
\geq \frac{\sigma^2\left(\tr x (A+y\tr y)^{-1} x_0\right)^2}{1+\tr x_0 (A+y\tr y)^{-1} x_0}
\end{displaymath}

Using the same notation as before, this is equivalent to:
\begin{displaymath}
  \frac{(x\cdot x_0)^2}{1+\norm{x_0}^2}
\geq \frac{\left(\left(1+\norm{y}^2\right)(x\cdot x_0)-(x\cdot y)(y\cdot x_0)\right)^2}
{\left(1+\norm{y}^2\right)\big((1+\norm{y}^2)(1+\norm{x_0}^2)-(x_0\cdot y)^2\big)}
\end{displaymath}

By the Cauchy--Schwarz inequality:
\begin{displaymath}
  (1+\norm{y}^2)(1+\norm{x_0}^2)-(x_0\cdot y)^2 \geq 0
\end{displaymath}

Thus, clearing denominators, the previous inequality is successively
equivalent to:
\begin{multline*}
  \left(1+\norm{y}^2\right)^2\left(1+\norm{x_0}^2\right)(x\cdot x_0)^2
-\left(1+\norm{y}^2\right)(x_0\cdot y)^2(x\cdot x_0)^2\\
\geq \left(1+\norm{x_0}^2\right)\left(1+\norm{y}^2\right)^2(x\cdot x_0)^2
+ \left(1+\norm{x_0}^2\right)(x\cdot y)^2(y\cdot x_0)^2\\
-2\left(1+\norm{x_0}^2\right)\left(1+\norm{y}^2\right)(x\cdot x_0)(x\cdot y)(y\cdot x_0)
\end{multline*}

\begin{multline*}
2\left(1+\norm{x_0}^2\right)\left(1+\norm{y}^2\right)(x\cdot x_0)(x\cdot y)(y\cdot x_0)\\
\geq \left(1+\norm{x_0}^2\right)(x\cdot y)^2(y\cdot x_0)^2
+ \left(1+\norm{y}^2\right)(x_0\cdot y)^2(x\cdot x_0)^2
\end{multline*}

This last inequality is not true in general. As soon as $x$, $y$ and $x_0$
span a space of dimension at least $2$, it is possible that, for example,
$(x\cdot x_0)$ and $(x\cdot y)$ are positive while $(y\cdot x_0)$ is negative.
Then the left-hand side of the inequality is negative, and so cannot exceed
the right-hand side, which is a sum of nonnegative terms.
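For a concrete illustration of this failure (with values chosen purely for
illustration), work in dimension $2$ with $A$ equal to the identity matrix,
which arises for instance from a database $D_n$ containing the two
observations $e_1=(1,0)$ and $e_2=(0,1)$. Take $x=e_1$, $x_0=e_2$ and
$y=(1,1)$. Then $\tr x A^{-1}x_0=0$, so by \eqref{eq:decrease} adding $x_0$ to
$D_n$ leaves the MSE unchanged. On the other hand,
\begin{displaymath}
  A+y\tr y=\begin{pmatrix}2&1\\1&2\end{pmatrix},\qquad
  (A+y\tr y)^{-1}=\frac{1}{3}\begin{pmatrix}2&-1\\-1&2\end{pmatrix},
\end{displaymath}
so $\tr x(A+y\tr y)^{-1}x_0=-1/3$ and $\tr x_0(A+y\tr y)^{-1}x_0=2/3$, and
adding $x_0$ to $D_m=D_n\cup\{y\}$ decreases the MSE by
\begin{displaymath}
  \frac{\sigma^2(1/3)^2}{1+2/3}=\frac{\sigma^2}{15}>0.
\end{displaymath}
In terms of the last inequality above, the left-hand side is $0$ while the
right-hand side equals $2$. The marginal decrease is thus strictly larger for
the larger database, so the MSE decrease is not submodular in general.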
In the one-dimensional case, the inner product $(x\cdot y)$ can be written as
$\lambda xy$ for some positive $\lambda$. Assuming $x$, $x_0$ and $y$ are
nonzero (otherwise both sides of the last inequality vanish and it holds
trivially), the last inequality becomes:
\begin{displaymath}
  2\geq \frac{\lambda y^2}{1+\lambda y^2}
+ \frac{\lambda x_0^2}{1+\lambda x_0^2}
\end{displaymath}
which holds because each term on the right-hand side is strictly less than $1$
(a more direct proof of the one-dimensional case is of course possible).
\nocite{shapley,inverse,recommendation,cook,shapleyor,subsetselection11}
\bibliographystyle{plain}
\bibliography{notes}

\end{document}
\ No newline at end of file
