\documentclass{article}
\usepackage[utf8]{inputenc}
\usepackage{amsmath,amsthm,amsfonts}
\newtheorem{lemma}{Lemma}
\newcommand{\var}{\mathop{\mathrm{Var}}}
\newcommand{\condexp}[2]{\mathop{\mathbb{E}}\left[#1|#2\right]}
\newcommand{\expt}[1]{\mathop{\mathbb{E}}\left[#1\right]}
\newcommand{\norm}[1]{\lVert#1\rVert}
\newcommand{\tr}[1]{#1^*}
\newcommand{\ip}[2]{\langle #1, #2 \rangle}
\newcommand{\mse}{\mathop{\mathrm{MSE}}}
\begin{document}
\section{Understanding the recommender system}
\subsection{General problem}
We already have a database $D_n$ of $n$ users. For each user $i$ we
have a vector of explanatory variables (features), denoted $x_i$.
The problem is the following: we are about to start an experiment in
which, for some of the users in the database, a variable of interest
$y_i$ will be revealed (for example through a medical survey, or
because a website buys some data from another website). From the pairs
$(x_i, y_i)$ acquired through the experiment, we will compute a
regression function $f_n$ which allows us to predict, for a new $x$,
its associated $y$. The accuracy of this prediction will be measured
by the Mean Squared Error:
\begin{displaymath}
\mse(f_n) = \expt{\big(f_n(x)-y\big)^2}
\end{displaymath}
We would like to understand the impact of the number of users who take
part in the experiment. In particular, how much does adding one user
to the experiment impact the MSE?
\subsection{From the bivariate normal case to linear regression}
Suppose $(X,Y)$ is drawn from a bivariate normal distribution with mean
vector $\mu$ and covariance matrix $\Sigma$. Then one can
write:
\begin{displaymath}
Y = \condexp{Y}{X} + \big(Y-\condexp{Y}{X}\big)
\end{displaymath}
In this particular case, $\condexp{Y}{X}$ is a linear function of $X$:
\begin{displaymath}
\condexp{Y}{X} = \alpha X + \beta
\end{displaymath}
where $\alpha$ and $\beta$ can be expressed as functions of $\mu$ and
$\Sigma$. Writing $\varepsilon = Y-\condexp{Y}{X}$, it is easy to see
that $\expt{X\varepsilon}=0$. Furthermore, $\varepsilon$ is also normally
distributed. Under these assumptions, it can be proven that the
least-squares estimator for $(\alpha,\beta)$ is optimal (it attains the
Cramér-Rao bound).
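Explicitly, writing $\mu=(\mu_X,\mu_Y)$ and
$\Sigma=\begin{pmatrix}\sigma_X^2 & \sigma_{XY}\\ \sigma_{XY} & \sigma_Y^2\end{pmatrix}$,
the standard formulas for the conditional expectation of a bivariate
normal give
\begin{displaymath}
\alpha = \frac{\sigma_{XY}}{\sigma_X^2},
\qquad
\beta = \mu_Y - \frac{\sigma_{XY}}{\sigma_X^2}\,\mu_X .
\end{displaymath}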
\subsection{Linear regression}
We assume a linear model:
\begin{displaymath}
y_i = \beta\cdot x_i + \varepsilon_i
\end{displaymath}
where $\varepsilon_i$ is a normal random variable uncorrelated with
$x_i$. We also assume that $\var(\varepsilon_i)$ is constant
(homoscedasticity); $\sigma^2$ will denote the common value.
From the database we compute the least-squares estimator of $\beta$:
\begin{displaymath}
\hat{\beta}_n = (\tr XX)^{-1}\tr XY
\end{displaymath}
where $X$ is an $n\times k$ matrix ($k$ is the number of explanatory
variables) whose $i$-th row is $\tr x_i$, and $Y$ is the vector
$(y_1,\ldots,y_n)$. The regression function is simply the inner product
of $\hat{\beta}_n$ and the new vector of explanatory variables $x$.
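In other words, for a new user with features $x$ the prediction is
\begin{displaymath}
f_n(x) = \hat\beta_n\cdot x = \tr x\,\hat\beta_n ,
\end{displaymath}
while the true response is $y = \beta\cdot x + \varepsilon$ for a fresh
noise term $\varepsilon$.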
From there we can compute the MSE:
\begin{displaymath}
\mse(D_n)
=\expt{\left(\beta\cdot x + \varepsilon - \hat\beta_n\cdot x\right)^2}
= \tr x\,\expt{(\hat\beta_n-\beta)\tr{(\hat\beta_n-\beta)}}\, x + \expt{\varepsilon^2}
\end{displaymath}
where we used that the new noise term $\varepsilon$ has mean zero and is
independent of $\hat\beta_n$, so that the cross term vanishes. The
variance-covariance matrix of
$\hat\beta_n$ is equal to $\sigma^2(\tr XX)^{-1}$. Finally we get:
\begin{displaymath}
\mse(D_n)
= \sigma^2\tr x(\tr XX)^{-1}x + \sigma^2
\end{displaymath}
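For completeness, the expression for the variance-covariance matrix used
above follows from writing
$\hat\beta_n-\beta=(\tr XX)^{-1}\tr X\varepsilon$, where $\varepsilon$
now denotes the vector $(\varepsilon_1,\ldots,\varepsilon_n)$, so that
\begin{displaymath}
\expt{(\hat\beta_n-\beta)\tr{(\hat\beta_n-\beta)}}
= (\tr XX)^{-1}\tr X\,\expt{\varepsilon\tr\varepsilon}\,X(\tr XX)^{-1}
= \sigma^2(\tr XX)^{-1},
\end{displaymath}
using that the $\varepsilon_i$ are uncorrelated across observations, so
that $\expt{\varepsilon\tr\varepsilon}=\sigma^2 I$.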
\subsubsection*{Monotonicity}
We first want to study the impact of adding one observation to the
database. Notice that:
\begin{displaymath}
\tr XX = \sum_{i=1}^n x_i \tr x_i
\end{displaymath}
Let us write $A= \tr X X$. Then adding $x_0$ to the database changes
$A$ into $A+x_0\tr x_0$.
The following derivation makes extensive use of the
Sherman-Morrison formula \cite{sm} (which can be proven by direct
verification). For any invertible matrix $A$ such that
$1+\tr y A^{-1}x\neq 0$:
\begin{equation}\label{eq:inverse}
(A+x\tr y)^{-1} = A^{-1} - \frac{A^{-1}x\tr yA^{-1}}{1+\tr y A^{-1}x}
\end{equation}
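For completeness, the direct verification reads
\begin{displaymath}
\begin{split}
(A+x\tr y)\left(A^{-1} - \frac{A^{-1}x\tr yA^{-1}}{1+\tr y A^{-1}x}\right)
& = I + x\tr yA^{-1}
- \frac{\left(1+\tr y A^{-1}x\right)x\tr yA^{-1}}{1+\tr y A^{-1}x}\\
& = I,
\end{split}
\end{displaymath}
where we used that $\tr y A^{-1}x$ is a scalar.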
$A=\tr XX$ is positive semidefinite; since it is assumed invertible, it
is in fact positive definite, and so is $A^{-1}$. We will denote by
$\ip{x}{y}$ the scalar product defined by $A^{-1}$ and by $\norm{x}$
the associated norm, that is:
\begin{displaymath}
\ip{x}{y} = \tr x A^{-1} y = \tr y A^{-1} x,
\qquad \norm{x}^2 = \ip{x}{x}
\end{displaymath}
Using \eqref{eq:inverse} we get:
\begin{displaymath}
\begin{split}
\tr x (A + x_0\tr x_0)^{-1} x & = \tr x A^{-1} x - \frac{\tr x
A^{-1}x_0\tr x_0A^{-1} x}{1+\tr x_0 A^{-1}x_0 }\\
& = \tr x A^{-1} x - \frac{\ip{x}{x_0}^2}{1+\norm{x_0}^2}
\end{split}
\end{displaymath}
Thus:
\begin{equation}\label{eq:decrease}
\mse(D_n\cup\{x_0\}) = \mse(D_n) - \frac{\sigma^2\ip{x}{x_0}^2}{1+\norm{x_0}^2}
\end{equation}
\emph{Adding one observation to the database never increases the MSE}
(and it strictly decreases it unless $\ip{x}{x_0}=0$).
\subsubsection*{Submodularity}
Let $D_m$ be a database of size $m$ containing $D_n$: $D_m$ is obtained
from $D_n$ by adding some observations. We would like to show that
adding one observation to $D_m$ yields a smaller decrease in MSE than
adding the same observation to $D_n$:
\begin{displaymath}
\mse(D_m)-\mse(D_m\cup\{x_0\})\leq \mse(D_n)-\mse(D_n\cup\{x_0\})
\end{displaymath}
First, remark that it suffices to prove this property when $D_n$ and
$D_m$ differ by only one observation: if the property holds in that
case, applying it repeatedly along a chain of databases interpolating
between $D_n$ and $D_m$, each obtained from the previous one by adding
a single observation, yields the general statement (the converse
implication is immediate, the one-observation case being a special
case).
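To make the chaining explicit, write
$D_n=E_0\subset E_1\subset\cdots\subset E_{m-n}=D_m$, where each
$E_{j+1}$ is obtained from $E_j$ by adding one observation. The
one-observation property applied at each step gives
\begin{multline*}
\mse(D_m)-\mse(D_m\cup\{x_0\})
\leq \mse(E_{m-n-1})-\mse(E_{m-n-1}\cup\{x_0\})\\
\leq \cdots
\leq \mse(D_n)-\mse(D_n\cup\{x_0\})
\end{multline*}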
Using \eqref{eq:decrease}, the decrease in MSE when adding $x_0$ to a
database whose matrix is $A$ is:
\begin{displaymath}
\frac{\sigma^2(\tr x A^{-1} x_0)^2}{1+\tr x_0 A^{-1} x_0}
\end{displaymath}
If we denote by $z$ the additional observation present in $D_m$ and
not in $D_n$, then we would like to prove that:
\begin{displaymath}
\frac{\sigma^2(\tr x A^{-1} x_0)^2}{1+\tr x_0 A^{-1} x_0}
\geq \frac{\sigma^2\left(\tr x (A+z\tr z)^{-1} x_0\right)^2}{1+\tr x_0 (A+z\tr z)^{-1} x_0}
\end{displaymath}
Using the same notation as before (and dividing by $\sigma^2$), this is
equivalent to:
\begin{displaymath}
\frac{\ip{x}{x_0}^2}{1+\norm{x_0}^2}
\geq \frac{\left(\left(1+\norm{z}^2\right)\ip{x}{x_0}-\ip{x}{z}\ip{z}{x_0}\right)^2}
{\left(1+\norm{z}^2\right)\big((1+\norm{z}^2)(1+\norm{x_0}^2)-\ip{x_0}{z}^2\big)}
\end{displaymath}
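Indeed, applying \eqref{eq:inverse} to the rank-one update $z\tr z$ gives
\begin{displaymath}
\begin{split}
\tr x (A+z\tr z)^{-1} x_0 & = \ip{x}{x_0} - \frac{\ip{x}{z}\ip{z}{x_0}}{1+\norm{z}^2}\\
1+\tr x_0 (A+z\tr z)^{-1} x_0
& = \frac{(1+\norm{z}^2)(1+\norm{x_0}^2)-\ip{x_0}{z}^2}{1+\norm{z}^2}
\end{split}
\end{displaymath}
and substituting these two expressions into the right-hand side yields
the displayed fraction.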
By the Cauchy-Schwarz inequality,
$\ip{x_0}{z}^2\leq\norm{x_0}^2\norm{z}^2$, hence
\begin{displaymath}
(1+\norm{z}^2)(1+\norm{x_0}^2)-\ip{x_0}{z}^2 > 0
\end{displaymath}
Both denominators being positive, the previous inequality is therefore
equivalent to:
\begin{multline*}
\left(1+\norm{z}^2\right)^2\left(1+\norm{x_0}^2\right)\ip{x}{x_0}^2
-\left(1+\norm{z}^2\right)\ip{x_0}{z}^2\ip{x}{x_0}^2\\
\geq \left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)^2\ip{x}{x_0}^2
+ \left(1+\norm{x_0}^2\right)\ip{x}{z}^2\ip{z}{x_0}^2\\
-2\left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)\ip{x}{x_0}\ip{x}{z}\ip{z}{x_0}
\end{multline*}
Cancelling the common term
$\left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)^2\ip{x}{x_0}^2$
on both sides and rearranging, this reduces to:
\begin{multline*}
2\left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)\ip{x}{x_0}\ip{x}{z}\ip{z}{x_0}\\
\geq \left(1+\norm{x_0}^2\right)\ip{x}{z}^2\ip{z}{x_0}^2
+ \left(1+\norm{z}^2\right)\ip{x_0}{z}^2\ip{x}{x_0}^2
\end{multline*}
This last inequality is not true in general. As soon as $x$, $x_0$ and
$z$ span a space of dimension at least $2$, it is possible that, for
example, $\ip{x}{x_0}$ and $\ip{x}{z}$ are positive while
$\ip{z}{x_0}$ is negative. The left-hand side is then negative and
cannot exceed the right-hand side, which is always nonnegative.
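For instance, take $A$ equal to the $2\times2$ identity matrix
(realized, for example, by the two observations $(1,0)$ and $(0,1)$)
together with
\begin{displaymath}
x=(1,0),\qquad x_0=(1,1),\qquad z=(1,-2).
\end{displaymath}
Then $\ip{x}{x_0}=1$, $\ip{x}{z}=1$, $\ip{z}{x_0}=-1$,
$\norm{x_0}^2=2$ and $\norm{z}^2=5$: the left-hand side equals
$2\cdot3\cdot6\cdot(-1)=-36$ while the right-hand side equals $3+6=9$,
so the inequality indeed fails.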
In the one-dimensional case, the inner product $\ip{x}{z}$ can be
written as $\lambda xz$, where $\lambda=A^{-1}$ is a positive
scalar. Then the last inequality becomes:
\begin{displaymath}
2\geq \frac{\lambda z^2}{1+\lambda z^2}
+ \frac{\lambda x_0^2}{1+\lambda x_0^2}
\end{displaymath}
which is trivially true, since each of the two fractions is less than
$1$ (a more direct proof for the one-dimensional case is of course
possible).
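In detail, substituting $\ip{x}{x_0}=\lambda xx_0$,
$\ip{x}{z}=\lambda xz$, $\ip{z}{x_0}=\lambda zx_0$,
$\norm{x_0}^2=\lambda x_0^2$ and $\norm{z}^2=\lambda z^2$ into the last
inequality gives
\begin{multline*}
2\lambda^3x^2x_0^2z^2\left(1+\lambda x_0^2\right)\left(1+\lambda z^2\right)\\
\geq \lambda^4x^2x_0^2z^4\left(1+\lambda x_0^2\right)
+\lambda^4x^2x_0^4z^2\left(1+\lambda z^2\right)
\end{multline*}
and dividing both sides by
$\lambda^3x^2x_0^2z^2(1+\lambda x_0^2)(1+\lambda z^2)$ (when $x$, $x_0$
and $z$ are all nonzero; otherwise both sides vanish) yields the
displayed form.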
\subparagraph{Useless attempt}
The inequality only depends on the projection of $x$ onto the plane
spanned by $x_0$ and $z$. Writing
\begin{displaymath}
x = \lambda x_0 + \mu z + v,\quad\mathrm{with}\quad v \,\bot\,
\mathrm{span}\{x_0, z\}
\end{displaymath}
(orthogonality being understood with respect to $\ip{\cdot}{\cdot}$),
the previous inequality becomes:
\begin{multline*}
2\left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)\ip{x_0}{z}
\left(\lambda\norm{x_0}^2+\mu\ip{x_0}{z}\right)
\left(\lambda\ip{x_0}{z}+\mu\norm{z}^2\right)\\
- \left(1+\norm{x_0}^2\right)\ip{x_0}{z}^2\left(\lambda\ip{x_0}{z}+\mu\norm{z}^2\right)^2\\
-
\left(1+\norm{z}^2\right)\ip{x_0}{z}^2\left(\lambda\norm{x_0}^2+\mu\ip{x_0}{z}\right)^2\geq 0
\end{multline*}
By expanding and reordering the terms, we obtain a quadratic function
of $\lambda$ and $\mu$:
\begin{multline*}
\lambda^2\ip{x_0}{z}^2\left[2\norm{x_0}^2+\norm{x_0}^4+\norm{x_0}^2\norm{z}^2
+\big(1+\norm{x_0}^2\big)\big(\norm{x_0}^2\norm{z}^2-\ip{x_0}{z}^2\big)\right]\\
+\mu^2\ip{x_0}{z}^2\left[2\norm{z}^2+\norm{z}^4+\norm{x_0}^2\norm{z}^2
+\big(1+\norm{z}^2\big)\big(\norm{x_0}^2\norm{z}^2-\ip{x_0}{z}^2\big)\right]\\
+2\lambda\mu\ip{x_0}{z}\Big[\norm{x_0}^2\norm{z}^2
\big(\norm{x_0}^2\norm{z}^2-\ip{x_0}{z}^2\big)\\
+\ip{x_0}{z}^2+\norm{x_0}^2\norm{z}^2
\big(1+\norm{x_0}^2+\norm{z}^2\big)\Big]\geq 0
\end{multline*}
This inequality holds for all $\lambda$ and $\mu$ if and only if the
corresponding quadratic form is positive semidefinite. Since its
diagonal coefficients are nonnegative (this follows from
$\ip{x_0}{z}^2\leq\norm{x_0}^2\norm{z}^2$), this is equivalent to the
nonnegativity of its determinant.
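For reference, the criterion used here is the standard one for
$2\times2$ symmetric matrices: $\begin{pmatrix} a & b\\ b & c\end{pmatrix}$
is positive semidefinite if and only if $a\geq 0$, $c\geq 0$ and
$ac-b^2\geq 0$; here $a$ and $c$ are the coefficients of $\lambda^2$ and
$\mu^2$ above, and $2b$ is the coefficient of $\lambda\mu$.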
\subparagraph{General derivation}
\nocite{shapley,inverse,recommendation,cook,shapleyor,subsetselection11}
\bibliographystyle{plain}
\bibliography{notes.bib}
\end{document}