-rw-r--r--  notes.bib                  | 11
-rw-r--r--  notes.tex                  | 72
-rw-r--r--  papers/shermanmorrison.pdf | bin 0 -> 279902 bytes
3 files changed, 68 insertions, 15 deletions
diff --git a/notes.bib b/notes.bib
index 7523938..785cf1e 100644
--- a/notes.bib
+++ b/notes.bib
@@ -138,3 +138,14 @@
isbn = {978-1-60558-047-0},
bibsource = {DBLP, http://dblp.uni-trier.de}
}
+
+@article{sm,
+ title={Adjustment of an inverse matrix corresponding to a change in one element of a given matrix},
+ author={Sherman, J. and Morrison, W.J.},
+ journal={The Annals of Mathematical Statistics},
+ volume={21},
+ number={1},
+ pages={124--127},
+ year={1950},
+ publisher={JSTOR}
+}
diff --git a/notes.tex b/notes.tex
index cd07a10..25ca121 100644
--- a/notes.tex
+++ b/notes.tex
@@ -15,22 +15,25 @@
\subsection{General problem}
-We have a database $D_n$ consisting of $n$ pairs $(x_i,y_i)_{1\leq
-i\leq n}$. $x_i$ is a vector of explanatory variables (could be only
-one number in the bivariate case) and $y_i$ is the variable of
-interest.
+We already have a database $D_n$ of $n$ users. For each user $i$ we
+have a vector $x_i$ of explanatory variables (features).
-From the database we want to compute a regression function $f_n$ such
-that we are able to compute the variable of interest $y$ from a new
-vector of explanatory variables $x$.
-
-The cost of the regression error will be measured by the MSE:
+The problem is the following: we are about to start an experiment in
+which, for some of the users in the database, a variable of interest
+$y_i$ will be revealed (for example through a medical survey, or
+because a website buys some data from another website). From the pairs
+$(x_i, y_i)$ acquired through the experiment, we will compute a
+regression function $f_n$ that allows us to predict, for a new $x$,
+its associated $y$. The accuracy of this prediction will be measured
+by the mean squared error (MSE):
\begin{displaymath}
\mse(f_n) = \expt{\big(f_n(x)-y\big)^2}
\end{displaymath}
-The general goal is to understand how the size of the database impacts
-the MSE of the regression function.
+We would like to understand the impact of the number of users who take
+part in the experiment. In particular, how much does adding one user
+to the experiment impact the MSE?
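To make the question concrete, here is a minimal simulation sketch in Python: a synthetic linear model with Gaussian noise (the model, its parameters and the helper names are illustrative assumptions, not taken from the notes), fitted by least squares on $n$ users, with the MSE estimated on held-out users.

    import numpy as np

    rng = np.random.default_rng(0)
    d, n_test, reps = 5, 2000, 200
    beta = rng.normal(size=d)              # hypothetical "true" coefficients

    def draw(n):
        X = rng.normal(size=(n, d))        # features x_i of n users
        y = X @ beta + rng.normal(size=n)  # revealed variable of interest y_i
        return X, y

    X_test, y_test = draw(n_test)

    def avg_mse(n):
        # average held-out MSE of the least-squares fit f_n over many experiments
        out = []
        for _ in range(reps):
            X, y = draw(n)
            coef, *_ = np.linalg.lstsq(X, y, rcond=None)
            out.append(np.mean((X_test @ coef - y_test) ** 2))
        return np.mean(out)

    for n in (10, 11, 50, 51, 200, 201):
        print(n, round(avg_mse(n), 4))
    # the drop in MSE between n and n+1 users shrinks as n grows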
\subsection{From the bivariate normal case to linear regression}
If $(X,Y)$ is drawn from a bivariate normal distribution with mean
@@ -95,8 +98,8 @@ database. First, notice that:
Let us write $A = \tr X X$. Then, adding $x_0$ to the database will change
$A$ to $A+x_0\tr x_0$.
-The following derivation will make an extensive use of the following
-formula found in \cite{inverse} (which can be proven by direct
+The following derivation will make extensive use of the
+Sherman-Morrison formula \cite{sm} (which can be proven by direct
verification). For any invertible matrix $A$:
\begin{equation}\label{eq:inverse}
(A+x\tr y)^{-1} = A^{-1} - \frac{A^{-1}x\tr yA^{-1}}{1+\tr y A^{-1}x}
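As a quick sanity check of this identity, a small numpy sketch (assuming $\tr$ denotes transposition; the test matrix and vectors are arbitrary):

    import numpy as np

    rng = np.random.default_rng(1)
    d = 4
    A = rng.normal(size=(d, d)) + d * np.eye(d)   # some invertible matrix
    x, y = rng.normal(size=d), rng.normal(size=d)

    Ainv = np.linalg.inv(A)
    lhs = np.linalg.inv(A + np.outer(x, y))
    rhs = Ainv - np.outer(Ainv @ x, y @ Ainv) / (1.0 + y @ Ainv @ x)
    print(np.allclose(lhs, rhs))   # True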
@@ -135,7 +138,7 @@ adding the same observation to $D_n$:
\mse(D_m)-\mse(D_m\cup\{x_0\})\leq \mse(D_n)-\mse(D_n\cup\{x_0\})
\end{displaymath}
-First remark that it is necessary and sufficient to prove this property when $D_n$ and
+First, remark that it is necessary and sufficient to prove this property when $D_n$ and
$D_m$ differ by only one observation. Indeed, if the property is true
in general, then it will also be true when the two databases differ by
only one observation. Conversely, if the property is true when the two
@@ -172,7 +175,7 @@ Thus the previous inequality is consequently equivalent to:
\begin{multline*}
\left(1+\norm{z}^2\right)^2\left(1+\norm{x_0}^2\right)\ip{x}{x_0}^2
-\left(1+\norm{z}^2\right)\ip{x_0}{z}^2\ip{x}{x_0}^2\\
-\geq \left(1+\norm{x_0}^2\right)\left(1+\norm{y}^2\right)^2\ip{x}{x_0}^2
+\geq \left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)^2\ip{x}{x_0}^2
+ \left(1+\norm{x_0}^2\right)\ip{x}{z}^2\ip{z}{x_0}^2\\
-2\left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)\ip{x}{x_0}\ip{x}{z}\ip{z}{x_0}
\end{multline*}
@@ -198,6 +201,45 @@ inequality becomes:
\end{displaymath}
which is trivially true (a more direct proof for the one-dimensional
case is of course possible).
+
+\subparagraph{Useless attempt}
+
+The inequality only depends on the projection of $x$ on the plane
+spanned by $x_0$ and $z$. Writing
+\begin{displaymath}
+ x = \lambda x_0 + \mu z + v,\quad\mathrm{with}\quad v \,\bot\,
+ \mathrm{span}\{x_0, z\}
+\end{displaymath}
+the previous inequality becomes:
+\begin{multline*}
+2\left(1+\norm{x_0}^2\right)\left(1+\norm{z}^2\right)\ip{x_0}{z}
+\left(\lambda\norm{x_0}^2+\mu\ip{x_0}{z}\right)
+\left(\lambda\ip{x_0}{z}+\mu\norm{z}^2\right)\\
+- \left(1+\norm{x_0}^2\right)\ip{x_0}{z}^2\left(\lambda\ip{x_0}{z}+\mu\norm{z}^2\right)^2\\
+-
+\left(1+\norm{z}^2\right)\ip{x_0}{z}^2\left(\lambda\norm{x_0}^2+\mu\ip{x_0}{z}\right)^2\geq 0
+\end{multline*}
+
+By expanding and reordering the terms, we obtain a quadratic function
+of $\lambda$ and $\mu$:
+\begin{multline*}
+ \lambda^2\ip{x_0}{z}^2\left[2\norm{x_0}^2+\norm{x_0}^4+\norm{x_0}^2\norm{z}^2
+ +(1+\norm{x_0}^2)\big(\norm{x_0}^2\norm{z}^2-\ip{x_0}{z}^2\big)\right]\\
++\mu^2\ip{x_0}{z}^2\left[2\norm{z}^2+\norm{z}^4+\norm{x_0}^2\norm{z}^2
+ +(1+\norm{z}^2)\big(\norm{x_0}^2\norm{z}^2-\ip{x_0}{z}^2\big)\right]\\
++2\lambda\mu\ip{x_0}{z}\Big[\norm{x_0}^2\norm{z}^2
+\big(\norm{x_0}^2\norm{z}^2-\ip{x_0}{z}^2\big)\\
++\ip{x_0}{z}^2+\norm{x_0}^2\norm{z}^2
+\big(1+\norm{x_0}^2+\norm{z}^2\big)\Big]\geq 0
+\end{multline*}
+
+This inequality holds for all $\lambda$ and $\mu$ if and only if the
+associated quadratic form is positive semidefinite. Since its trace is
+nonnegative (by the Cauchy-Schwarz inequality,
+$\norm{x_0}^2\norm{z}^2-\ip{x_0}{z}^2\geq 0$), this is equivalent to
+the nonnegativity of its determinant.
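For a symmetric $2\times 2$ matrix, positive semidefiniteness is indeed equivalent to having a nonnegative trace and a nonnegative determinant; a small illustrative check (the helper name is made up):

    import numpy as np

    def is_psd_2x2(a, b, d):
        # symmetric matrix [[a, b], [b, d]]: PSD  <=>  trace >= 0 and det >= 0
        return a + d >= 0 and a * d - b * b >= 0

    # cross-check against an eigenvalue computation on a few examples
    for a, b, d in [(2.0, 1.0, 1.0), (1.0, 2.0, 1.0), (-1.0, 0.0, 3.0)]:
        eig_ok = np.linalg.eigvalsh([[a, b], [b, d]]).min() >= 0
        print(is_psd_2x2(a, b, d), eig_ok)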
+
+\subparagraph{General derivation}
+
+
\nocite{shapley,inverse,recommendation,cook,shapleyor,subsetselection11}
\bibliographystyle{plain}
\bibliography{notes.bib}
diff --git a/papers/shermanmorrison.pdf b/papers/shermanmorrison.pdf
new file mode 100644
index 0000000..ac1d5b9
--- /dev/null
+++ b/papers/shermanmorrison.pdf
Binary files differ