| author | Thibaut Horel <thibaut.horel@gmail.com> | 2012-01-16 18:18:10 -0800 |
|---|---|---|
| committer | Thibaut Horel <thibaut.horel@gmail.com> | 2012-01-16 18:18:10 -0800 |
| commit | 8d311a511a7c673698b08d48e80fa19dc8247a71 (patch) | |
| tree | 54d6aa5d1fb1ec308062f3ee8a52a4ddf8bd2872 | |
| parent | 7a24fd394cc12e1cdab77b3f9c0128ce8d336a15 (diff) | |
| download | recommendation-8d311a511a7c673698b08d48e80fa19dc8247a71.tar.gz | |
Some notes.
| -rw-r--r-- | notes.bib | 140 |
| -rw-r--r-- | notes.tex | 199 |
2 files changed, 339 insertions, 0 deletions
diff --git a/notes.bib b/notes.bib
new file mode 100644
index 0000000..7523938
--- /dev/null
+++ b/notes.bib
@@ -0,0 +1,140 @@
+@article{inverse,
+  title = {On the Inverse of the Sum of Matrices},
+  author = {Miller, Kenneth S.},
+  journal = {Mathematics Magazine},
+  volume = {54},
+  number = {2},
+  pages = {67--72},
+  url = {http://www.jstor.org/stable/2690437},
+  ISSN = {0025570X},
+  language = {English},
+  year = {1981},
+  publisher = {Mathematical Association of America},
+  copyright = {Copyright © 1981 Mathematical Association of America},
+}
+
+@article{cook,
+  title = {Influential Observations in Linear Regression},
+  author = {Cook, R. Dennis},
+  journal = {Journal of the American Statistical Association},
+  volume = {74},
+  number = {365},
+  pages = {169--174},
+  url = {http://www.jstor.org/stable/2286747},
+  ISSN = {01621459},
+  language = {English},
+  year = {1979},
+  publisher = {American Statistical Association},
+  copyright = {Copyright © 1979 American Statistical Association},
+}
+
+@article{recommendation,
+  author    = {Paul D{\"u}tting and
+               Monika Rauch Henzinger and
+               Ingmar Weber},
+  title     = {On the Pricing of Recommendations and Recommending Strategically},
+  journal   = {CoRR},
+  volume    = {abs/0911.1619},
+  year      = {2009},
+  ee        = {http://arxiv.org/abs/0911.1619},
+  bibsource = {DBLP, http://dblp.uni-trier.de}
+}
+
+@inproceedings{shapley,
+  author    = {Vishal Misra and
+               Stratis Ioannidis and
+               Augustin Chaintreau and
+               Laurent Massouli{\'e}},
+  title     = {Incentivizing peer-assisted services: a fluid shapley value
+               approach},
+  booktitle = {SIGMETRICS},
+  year      = {2010},
+  pages     = {215--226},
+  ee        = {http://doi.acm.org/10.1145/1811039.1811064},
+  crossref  = {DBLP:conf/sigmetrics/2010},
+  bibsource = {DBLP, http://dblp.uni-trier.de}
+}
+
+@proceedings{DBLP:conf/sigmetrics/2010,
+  editor    = {Vishal Misra and
+               Paul Barford and
+               Mark S. Squillante},
+  title     = {SIGMETRICS 2010, Proceedings of the 2010 ACM SIGMETRICS
+               International Conference on Measurement and Modeling of
+               Computer Systems, New York, New York, USA, 14-18 June 2010},
+  booktitle = {SIGMETRICS},
+  publisher = {ACM},
+  year      = {2010},
+  isbn      = {978-1-4503-0038-4},
+  bibsource = {DBLP, http://dblp.uni-trier.de}
+}
+
+@article{shapleyor,
+  author = {Moulin, Hervé and Shenker, Scott},
+  title = {Strategyproof sharing of submodular costs: budget balance versus efficiency},
+  journal = {Economic Theory},
+  publisher = {Springer Berlin / Heidelberg},
+  issn = {0938-2259},
+  pages = {511--533},
+  volume = {18},
+  number = {3},
+  url = {http://dx.doi.org/10.1007/PL00004200},
+  year = {2001}
+}
+
+@inproceedings{subsetselection11,
+  author    = {Abhimanyu Das and
+               David Kempe},
+  title     = {Submodular meets Spectral: Greedy Algorithms for Subset
+               Selection, Sparse Approximation and Dictionary Selection},
+  booktitle = {ICML},
+  year      = {2011},
+  pages     = {1057--1064},
+  crossref  = {DBLP:conf/icml/2011},
+  bibsource = {DBLP, http://dblp.uni-trier.de}
+}
+
+@proceedings{DBLP:conf/icml/2011,
+  editor    = {Lise Getoor and
+               Tobias Scheffer},
+  title     = {Proceedings of the 28th International Conference on Machine
+               Learning, ICML 2011, Bellevue, Washington, USA, June 28
+               - July 2, 2011},
+  booktitle = {ICML},
+  publisher = {Omnipress},
+  year      = {2011},
+  bibsource = {DBLP, http://dblp.uni-trier.de}
+}
+
+@inproceedings{subsetselection08,
+  author    = {Abhimanyu Das and
+               David Kempe},
+  title     = {Algorithms for subset selection in linear regression},
+  booktitle = {STOC},
+  year      = {2008},
+  pages     = {45--54},
+  ee        = {http://doi.acm.org/10.1145/1374376.1374384},
+  crossref  = {DBLP:conf/stoc/2008},
+  bibsource = {DBLP, http://dblp.uni-trier.de}
+}
+
+@proceedings{DBLP:conf/stoc/2008,
+  editor    = {Cynthia Dwork},
+  title     = {Proceedings of the 40th Annual ACM Symposium on Theory of
+               Computing, Victoria, British Columbia, Canada, May 17-20,
+               2008},
+  booktitle = {STOC},
+  publisher = {ACM},
+  year      = {2008},
+  isbn      = {978-1-60558-047-0},
+  bibsource = {DBLP, http://dblp.uni-trier.de}
+}
diff --git a/notes.tex b/notes.tex
new file mode 100644
index 0000000..f75d2b7
--- /dev/null
+++ b/notes.tex
@@ -0,0 +1,199 @@
+\documentclass{article}
+\usepackage[utf8]{inputenc}
+\usepackage{amsmath,amsthm,amsfonts}
+\newtheorem{lemma}{Lemma}
+\newcommand{\var}{\mathop{\mathrm{Var}}}
+\newcommand{\condexp}[2]{\mathop{\mathbb{E}}\left[#1|#2\right]}
+\newcommand{\expt}[1]{\mathop{\mathbb{E}}\left[#1\right]}
+\newcommand{\norm}[1]{\lVert#1\rVert}
+\newcommand{\tr}[1]{#1^*}
+\newcommand{\mse}{\mathop{\mathrm{MSE}}}
+\begin{document}
+
+\section{Understanding the recommender system}
+
+\subsection{General problem}
+
+We have a database $D_n$ consisting of $n$ pairs $(x_i,y_i)_{1\leq
+i\leq n}$, where $x_i$ is a vector of explanatory variables (a single
+number in the bivariate case) and $y_i$ is the variable of interest.
+
+From the database we want to compute a regression function $f_n$ that
+predicts the variable of interest $y$ from a new vector of explanatory
+variables $x$.
+
+The cost of the regression error is measured by the MSE:
+\begin{displaymath}
+  \mathrm{MSE}(f_n) = \expt{\big(f_n(x)-y\big)^2}
+\end{displaymath}
+
+The general goal is to understand how the size of the database impacts
+the MSE of the derived regression function.
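
As an editorial aside (not part of the committed files), the following minimal sketch illustrates the setup above: it simulates a database drawn from a linear model, fits it by least squares, and estimates the MSE at a fixed query point for several database sizes. The use of NumPy and all names (`beta`, `sigma`, the sample sizes) are illustrative assumptions, not something prescribed by the notes.

```python
# Illustrative sketch: synthetic database, OLS fit, Monte-Carlo estimate of
# MSE(f_n) = E[(f_n(x) - y)^2] at a fixed query point x, for growing n.
import numpy as np

rng = np.random.default_rng(0)
k, sigma = 3, 0.5                        # number of explanatory variables, noise std
beta = rng.normal(size=k)                # "true" coefficients of the linear model
x = rng.normal(size=k)                   # new vector of explanatory variables

def mse_estimate(n, trials=2000):
    """Monte-Carlo estimate of E[(f_n(x) - y)^2] for a database of size n."""
    errors = []
    for _ in range(trials):
        X = rng.normal(size=(n, k))                      # explanatory variables
        Y = X @ beta + sigma * rng.normal(size=n)        # variables of interest
        beta_hat = np.linalg.lstsq(X, Y, rcond=None)[0]  # least-squares estimate
        y_new = beta @ x + sigma * rng.normal()          # fresh observation at x
        errors.append((beta_hat @ x - y_new) ** 2)
    return np.mean(errors)

for n in (5, 20, 100):
    print(n, mse_estimate(n))   # decreases towards sigma**2 as n grows
```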
+
+\subsection{From the bivariate normal case to linear regression}
+
+If $(X,Y)$ is drawn from a bivariate normal distribution, then one can
+write:
+\begin{displaymath}
+  Y = \condexp{Y}{X} + \big(Y-\condexp{Y}{X}\big)
+\end{displaymath}
+In the bivariate normal case, $\condexp{Y}{X}$ is an affine function of
+$X$ and the residual $Y-\condexp{Y}{X}$ is a centered normal variable
+independent of $X$, which motivates the linear model of the next
+subsection.
+
+\subsection{Linear regression}
+
+We assume a linear model:
+\begin{displaymath}
+  y_i = \beta\cdot x_i + \varepsilon_i
+\end{displaymath}
+where $\varepsilon_i$ is a normal random variable uncorrelated with
+$x_i$. We also assume that $\var(\varepsilon_i)$ is constant
+(homoscedasticity); $\sigma^2$ will denote the common value.
+
+From the database we compute the least-squares estimator of $\beta$:
+\begin{displaymath}
+  \hat{\beta}_n = (\tr XX)^{-1}\tr XY
+\end{displaymath}
+where $X$ is the $n\times k$ matrix ($k$ is the number of explanatory
+variables) whose $i$-th row is $\tr x_i$, and $Y$ is $(y_1,\ldots,y_n)$.
+
+The regression function is simply the inner product of $\hat{\beta}_n$
+with the new vector of explanatory variables $x$.
+
+From there we can compute the MSE:
+\begin{displaymath}
+  \mse(D_n)
+  =\expt{\left(\beta\cdot x + \varepsilon - \hat\beta_n\cdot x\right)^2}
+  = \tr x\expt{(\hat\beta_n-\beta)\tr{(\hat\beta_n-\beta)}} x
+  + \expt{\varepsilon^2}
+\end{displaymath}
+where we used that $\varepsilon$ is centered and independent of
+$\hat\beta_n$, so the cross term vanishes. The variance-covariance
+matrix of $\hat\beta_n$ is equal to $\sigma^2(\tr XX)^{-1}$. Finally we
+get:
+\begin{displaymath}
+  \mse(D_n)
+  = \sigma^2\tr x(\tr XX)^{-1}x + \sigma^2
+\end{displaymath}
+
+\subsubsection*{Monotonicity}
+
+We first want to study the impact of adding one observation to the
+database. First, notice that:
+\begin{displaymath}
+  \tr XX = \sum_{i=1}^n x_i \tr x_i
+\end{displaymath}
+Let us write $A= \tr X X$. Then, adding $x_0$ to the database changes
+$A$ into $A+x_0\tr x_0$.
+
+The following derivation makes extensive use of the following formula,
+found in \cite{inverse} (it can be proven by direct verification). For
+any invertible matrix $A$ and vectors $u$, $v$ with
+$1+\tr v A^{-1}u\neq 0$:
+\begin{equation}\label{eq:inverse}
+(A+u\tr v)^{-1} = A^{-1} - \frac{A^{-1}u\tr vA^{-1}}{1+\tr v A^{-1}u}
+\end{equation}
+
+Here $A=\tr XX$ is positive semidefinite, and since it is assumed
+invertible it is in fact positive definite; hence so is $A^{-1}$. We
+will denote by $x\cdot y$ the scalar product defined by $A^{-1}$ and by
+$\norm{x}$ the associated norm, that is:
+\begin{displaymath}
+  x\cdot y = \tr x A^{-1} y = \tr y A^{-1} x,
+  \qquad \norm{x}^2 = x\cdot x
+\end{displaymath}
+
+Using \eqref{eq:inverse} with $u=v=x_0$ we get:
+\begin{displaymath}
+\begin{split}
+  \tr x (A + x_0\tr x_0)^{-1} x & = \tr x A^{-1} x - \frac{\tr x
+  A^{-1}x_0\tr x_0A^{-1} x}{1+\tr x_0 A^{-1}x_0 }\\
+& = \tr x A^{-1} x - \frac{(x\cdot x_0)^2}{1+\norm{x_0}^2}
+\end{split}
+\end{displaymath}
+
+Thus:
+\begin{equation}\label{eq:decrease}
+\mse(D_n\cup\{x_0\}) = \mse(D_n) - \frac{\sigma^2(x\cdot x_0)^2}{1+\norm{x_0}^2}
+\end{equation}
+
+\emph{Adding one observation to the database never increases the MSE.}
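
As a quick numerical sanity check of \eqref{eq:inverse} and \eqref{eq:decrease} (again an editorial sketch, not part of the commit), the following compares the MSE computed directly from the updated matrix $A + x_0\tr{x_0}$ with the closed-form decrease. The random database, `sigma`, and NumPy are assumptions of the sketch.

```python
# Numerical check of the rank-one update and of the MSE decrease formula.
import numpy as np

rng = np.random.default_rng(1)
n, k, sigma = 50, 4, 0.3
X = rng.normal(size=(n, k))
A = X.T @ X                       # A = X* X for the current database
x = rng.normal(size=k)            # query point
x0 = rng.normal(size=k)           # observation to be added

A_inv = np.linalg.inv(A)
mse_before = sigma**2 * x @ A_inv @ x + sigma**2
mse_after = sigma**2 * x @ np.linalg.inv(A + np.outer(x0, x0)) @ x + sigma**2

# Closed-form decrease (eq:decrease), using the scalar product defined by A^{-1}.
decrease = sigma**2 * (x @ A_inv @ x0) ** 2 / (1 + x0 @ A_inv @ x0)

assert np.isclose(mse_before - mse_after, decrease)
assert mse_after <= mse_before + 1e-12   # adding an observation never increases the MSE
print(mse_before, mse_after, decrease)
```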
+
+\subsubsection*{Submodularity}
+
+Let $D_m$ be a database of size $m$ containing $D_n$: $D_m$ is obtained
+from $D_n$ by adding some observations. We would like to show that
+adding one observation to $D_m$ yields a smaller decrease in MSE than
+adding the same observation to $D_n$:
+\begin{displaymath}
+  \mse(D_m)-\mse(D_m\cup\{x_0\})\leq \mse(D_n)-\mse(D_n\cup\{x_0\})
+\end{displaymath}
+
+First remark that it suffices to prove this property when $D_n$ and
+$D_m$ differ by only one observation: if it holds in that case, then
+applying it repeatedly along a chain of databases between $D_n$ and
+$D_m$ yields the general property (and the general property trivially
+contains the one-observation case).
+
+Using \eqref{eq:decrease}, the decrease of the MSE when adding $x_0$ to
+the database with matrix $A$ is:
+\begin{displaymath}
+  \frac{\sigma^2(\tr x A^{-1} x_0)^2}{1+\tr x_0 A^{-1} x_0}
+\end{displaymath}
+
+If we denote by $y$ the additional observation present in $D_m$ and
+not in $D_n$, then we would like to prove that:
+\begin{displaymath}
+  \frac{\sigma^2(\tr x A^{-1} x_0)^2}{1+\tr x_0 A^{-1} x_0}
+\geq \frac{\sigma^2\left(\tr x (A+y\tr y)^{-1} x_0\right)^2}{1+\tr x_0 (A+y\tr y)^{-1} x_0}
+\end{displaymath}
+
+Using the same notations as before, and applying \eqref{eq:inverse} with
+$u=v=y$ to both the numerator and the denominator of the right-hand
+side, this is equivalent to:
+\begin{displaymath}
+  \frac{(x\cdot x_0)^2}{1+\norm{x_0}^2}
+\geq \frac{\left(\left(1+\norm{y}^2\right)(x\cdot x_0)-(x\cdot
+  y)(y\cdot x_0)\right)^2}
+{\left(1+\norm{y}^2\right)\big((1+\norm{y}^2)(1+\norm{x_0}^2)-(x_0\cdot
+y)^2\big)}
+\end{displaymath}
+
+By the Cauchy--Schwarz inequality,
+\begin{displaymath}
+  (x_0\cdot y)^2 \leq \norm{x_0}^2\norm{y}^2
+  < (1+\norm{y}^2)(1+\norm{x_0}^2),
+\end{displaymath}
+so both denominators are positive and we may cross-multiply. The
+previous inequality is thus successively equivalent to:
+\begin{multline*}
+  \left(1+\norm{y}^2\right)^2\left(1+\norm{x_0}^2\right)(x\cdot x_0)^2
+-\left(1+\norm{y}^2\right)(x_0\cdot y)^2(x\cdot x_0)^2\\
+\geq \left(1+\norm{x_0}^2\right)\left(1+\norm{y}^2\right)^2(x\cdot
+x_0)^2
++ \left(1+\norm{x_0}^2\right)(x\cdot y)^2(y\cdot x_0)^2\\
+-2\left(1+\norm{x_0}^2\right)\left(1+\norm{y}^2\right)(x\cdot
+x_0)(x\cdot y)(y\cdot x_0)
+\end{multline*}
+and, after cancelling the common term
+$\left(1+\norm{y}^2\right)^2\left(1+\norm{x_0}^2\right)(x\cdot x_0)^2$
+and rearranging, to:
+\begin{multline*}
+2\left(1+\norm{x_0}^2\right)\left(1+\norm{y}^2\right)(x\cdot
+x_0)(x\cdot y)(y\cdot x_0)\\
+\geq \left(1+\norm{x_0}^2\right)(x\cdot y)^2(y\cdot x_0)^2
++ \left(1+\norm{y}^2\right)(x_0\cdot y)^2(x\cdot x_0)^2
+\end{multline*}
+
+This last inequality is not true in general. As soon as $x$, $y$ and
+$x_0$ span a space of dimension at least $2$, it is possible that, for
+example, $(x\cdot x_0)$ and $(x\cdot y)$ are positive and $(y\cdot x_0)$
+is negative. Then the left-hand side of the inequality is negative and
+cannot be greater than the right-hand side, which is always
+nonnegative.
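
To make the failure concrete, here is a small numerical counterexample (an editorial sketch, not part of the commit): taking $A$ equal to the identity (e.g. the database $\{e_1, e_2\}$), $x = e_1$, $x_0 = e_2$ and $y = (1,-1)$, adding $x_0$ to the larger database decreases the MSE at $x$ by $\sigma^2/15$, while adding it to the smaller database decreases it by $0$. NumPy and the specific vectors are assumptions of the sketch.

```python
# Concrete 2-D counterexample to the diminishing-returns (submodularity) inequality.
import numpy as np

sigma = 1.0
A = np.eye(2)                    # database D_n with sum x_i x_i^* = I (e.g. {e1, e2})
x = np.array([1.0, 0.0])         # query point
x0 = np.array([0.0, 1.0])        # observation to be added
y = np.array([1.0, -1.0])        # extra observation contained in D_m but not in D_n

def decrease(A, x, x0):
    """MSE decrease at x when x0 is added to a database with matrix A (eq:decrease)."""
    A_inv = np.linalg.inv(A)
    return sigma**2 * (x @ A_inv @ x0) ** 2 / (1 + x0 @ A_inv @ x0)

d_small = decrease(A, x, x0)                    # adding x0 to D_n: 0
d_large = decrease(A + np.outer(y, y), x, x0)   # adding x0 to D_m: sigma^2 / 15

print(d_small, d_large)          # 0.0  0.0666...
assert d_large > d_small         # diminishing returns fails
```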
+
+In the one-dimensional case, the scalar product $(x\cdot y)$ can be
+written as $\lambda xy$ for some positive $\lambda$. Then, when $x$,
+$x_0$ and $y$ are nonzero (otherwise both sides vanish), the last
+inequality becomes:
+\begin{displaymath}
+  2\geq \frac{\lambda y^2}{1+\lambda y^2}
++ \frac{\lambda x_0^2}{1+\lambda x_0^2}
+\end{displaymath}
+which is trivially true, since each of the two terms is smaller than
+$1$ (a more direct proof for the one-dimensional case is of course
+possible). \emph{Thus the diminishing-returns property above holds in
+the one-dimensional case but fails in general.}
+\nocite{shapley,inverse,recommendation,cook,shapleyor,subsetselection11}
+\bibliographystyle{plain}
+\bibliography{notes}
+
+\end{document}
\ No newline at end of file
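
Finally, as a complement to the closing one-dimensional argument (again an editorial sketch outside the committed files), a randomized check that the diminishing-returns inequality always holds in dimension one; the parameter ranges and NumPy usage are arbitrary assumptions.

```python
# Randomized check of the one-dimensional claim: in dimension 1 the MSE decrease
# exhibits diminishing returns, i.e. adding x0 to the larger database never helps more.
import numpy as np

rng = np.random.default_rng(2)
sigma = 1.0

def decrease(a, x, x0):
    """MSE decrease at x when x0 is added to a 1-D database with a = sum of x_i^2."""
    return sigma**2 * (x * x0 / a) ** 2 / (1 + x0**2 / a)

for _ in range(10000):
    a = rng.uniform(0.1, 5.0)          # current database: a = sum of squares
    x, x0, y = rng.normal(size=3)      # query point, new point, extra point in D_m
    assert decrease(a + y**2, x, x0) <= decrease(a, x, x0) + 1e-12
print("diminishing returns held in all 1-D trials")
```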
