| author | Thibaut Horel <thibaut.horel@gmail.com> | 2012-01-16 18:18:10 -0800 |
|---|---|---|
| committer | Thibaut Horel <thibaut.horel@gmail.com> | 2012-01-16 18:18:10 -0800 |
| commit | 8d311a511a7c673698b08d48e80fa19dc8247a71 (patch) | |
| tree | 54d6aa5d1fb1ec308062f3ee8a52a4ddf8bd2872 | |
| parent | 7a24fd394cc12e1cdab77b3f9c0128ce8d336a15 (diff) | |
| download | recommendation-8d311a511a7c673698b08d48e80fa19dc8247a71.tar.gz | |
Some notes.
| -rw-r--r-- | notes.bib | 140 |
| -rw-r--r-- | notes.tex | 199 |
2 files changed, 339 insertions, 0 deletions
diff --git a/notes.bib b/notes.bib
new file mode 100644
index 0000000..7523938
--- /dev/null
+++ b/notes.bib
@@ -0,0 +1,140 @@
+@article{inverse,
+  title = {On the Inverse of the Sum of Matrices},
+  author = {Miller, Kenneth S.},
+  journal = {Mathematics Magazine},
+  volume = {54},
+  number = {2},
+  pages = {67--72},
+  url = {http://www.jstor.org/stable/2690437},
+  ISSN = {0025570X},
+  language = {English},
+  year = {1981},
+  publisher = {Mathematical Association of America},
+  copyright = {Copyright © 1981 Mathematical Association of America},
+}
+
+@article{cook,
+  title = {Influential Observations in Linear Regression},
+  author = {Cook, R. Dennis},
+  journal = {Journal of the American Statistical Association},
+  volume = {74},
+  number = {365},
+  pages = {169--174},
+  url = {http://www.jstor.org/stable/2286747},
+  ISSN = {01621459},
+  language = {English},
+  year = {1979},
+  publisher = {American Statistical Association},
+  copyright = {Copyright © 1979 American Statistical Association},
+}
+
+@article{recommendation,
+  author    = {Paul D{\"u}tting and
+               Monika Rauch Henzinger and
+               Ingmar Weber},
+  title     = {On the Pricing of Recommendations and Recommending Strategically},
+  journal   = {CoRR},
+  volume    = {abs/0911.1619},
+  year      = {2009},
+  ee        = {http://arxiv.org/abs/0911.1619},
+  bibsource = {DBLP, http://dblp.uni-trier.de}
+}
+
+@inproceedings{shapley,
+  author    = {Vishal Misra and
+               Stratis Ioannidis and
+               Augustin Chaintreau and
+               Laurent Massouli{\'e}},
+  title     = {Incentivizing peer-assisted services: a fluid shapley value
+               approach},
+  booktitle = {SIGMETRICS},
+  year      = {2010},
+  pages     = {215--226},
+  ee        = {http://doi.acm.org/10.1145/1811039.1811064},
+  crossref  = {DBLP:conf/sigmetrics/2010},
+  bibsource = {DBLP, http://dblp.uni-trier.de}
+}
+
+@proceedings{DBLP:conf/sigmetrics/2010,
+  editor    = {Vishal Misra and
+               Paul Barford and
+               Mark S. Squillante},
+  title     = {SIGMETRICS 2010, Proceedings of the 2010 ACM SIGMETRICS
+               International Conference on Measurement and Modeling of
+               Computer Systems, New York, New York, USA, 14-18 June 2010},
+  booktitle = {SIGMETRICS},
+  publisher = {ACM},
+  year      = {2010},
+  isbn      = {978-1-4503-0038-4},
+  bibsource = {DBLP, http://dblp.uni-trier.de}
+}
+
+@article{shapleyor,
+  author = {Moulin, Hervé and Shenker, Scott},
+  title = {Strategyproof sharing of submodular costs: budget balance versus efficiency},
+  journal = {Economic Theory},
+  publisher = {Springer Berlin / Heidelberg},
+  issn = {0938-2259},
+  pages = {511--533},
+  volume = {18},
+  number = {3},
+  url = {http://dx.doi.org/10.1007/PL00004200},
+  year = {2001}
+}
+
+@inproceedings{subsetselection11,
+  author    = {Abhimanyu Das and
+               David Kempe},
+  title     = {Submodular meets Spectral: Greedy Algorithms for Subset
+               Selection, Sparse Approximation and Dictionary Selection},
+  booktitle = {ICML},
+  year      = {2011},
+  pages     = {1057--1064},
+  crossref  = {DBLP:conf/icml/2011},
+  bibsource = {DBLP, http://dblp.uni-trier.de}
+}
+
+@proceedings{DBLP:conf/icml/2011,
+  editor    = {Lise Getoor and
+               Tobias Scheffer},
+  title     = {Proceedings of the 28th International Conference on Machine
+               Learning, ICML 2011, Bellevue, Washington, USA, June 28
+               - July 2, 2011},
+  booktitle = {ICML},
+  publisher = {Omnipress},
+  year      = {2011},
+  bibsource = {DBLP, http://dblp.uni-trier.de}
+}
+
+@inproceedings{subsetselection08,
+  author    = {Abhimanyu Das and
+               David Kempe},
+  title     = {Algorithms for subset selection in linear regression},
+  booktitle = {STOC},
+  year      = {2008},
+  pages     = {45--54},
+  ee        = {http://doi.acm.org/10.1145/1374376.1374384},
+  crossref  = {DBLP:conf/stoc/2008},
+  bibsource = {DBLP, http://dblp.uni-trier.de}
+}
+
+@proceedings{DBLP:conf/stoc/2008,
+  editor    = {Cynthia Dwork},
+  title     = {Proceedings of the 40th Annual ACM Symposium on Theory of
+               Computing, Victoria, British Columbia, Canada, May 17-20,
+               2008},
+  booktitle = {STOC},
+  publisher = {ACM},
+  year      = {2008},
+  isbn      = {978-1-60558-047-0},
+  bibsource = {DBLP, http://dblp.uni-trier.de}
+}
diff --git a/notes.tex b/notes.tex
new file mode 100644
index 0000000..f75d2b7
--- /dev/null
+++ b/notes.tex
@@ -0,0 +1,199 @@
+\documentclass{article}
+\usepackage[utf8]{inputenc}
+\usepackage{amsmath,amsthm,amsfonts}
+\newtheorem{lemma}{Lemma}
+\newcommand{\var}{\mathop{\mathrm{Var}}}
+\newcommand{\condexp}[2]{\mathop{\mathbb{E}}\left[#1|#2\right]}
+\newcommand{\expt}[1]{\mathop{\mathbb{E}}\left[#1\right]}
+\newcommand{\norm}[1]{\lVert#1\rVert}
+\newcommand{\tr}[1]{#1^*}
+\newcommand{\mse}{\mathop{\mathrm{MSE}}}
+\begin{document}
+
+\section{Understanding the recommender system}
+
+\subsection{General problem}
+
+We have a database $D_n$ consisting of $n$ pairs $(x_i,y_i)_{1\leq
+i\leq n}$, where $x_i$ is a vector of explanatory variables (a single
+number in the bivariate case) and $y_i$ is the variable of interest.
+
+From the database we want to compute a regression function $f_n$ that
+predicts the variable of interest $y$ from a new vector of explanatory
+variables $x$.
+
+The cost of the regression error is measured by the MSE:
+\begin{displaymath}
+  \mathrm{MSE}(f_n) = \expt{\big(f_n(x)-y\big)^2}
+\end{displaymath}
+
+The general goal is to understand how the size of the database impacts
+the MSE of the derived regression function.
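
As an editorial aside (not part of the committed files), the following minimal sketch illustrates the setup above: it simulates a database drawn from a linear model, fits it by least squares, and estimates the MSE at a fixed query point for several database sizes. The use of NumPy and all names (`beta`, `sigma`, the sample sizes) are illustrative assumptions, not something prescribed by the notes.

```python
# Illustrative sketch: synthetic database, OLS fit, Monte-Carlo estimate of
# MSE(f_n) = E[(f_n(x) - y)^2] at a fixed query point x, for growing n.
import numpy as np

rng = np.random.default_rng(0)
k, sigma = 3, 0.5                        # number of explanatory variables, noise std
beta = rng.normal(size=k)                # "true" coefficients of the linear model
x = rng.normal(size=k)                   # new vector of explanatory variables

def mse_estimate(n, trials=2000):
    """Monte-Carlo estimate of E[(f_n(x) - y)^2] for a database of size n."""
    errors = []
    for _ in range(trials):
        X = rng.normal(size=(n, k))                      # explanatory variables
        Y = X @ beta + sigma * rng.normal(size=n)        # variables of interest
        beta_hat = np.linalg.lstsq(X, Y, rcond=None)[0]  # least-squares estimate
        y_new = beta @ x + sigma * rng.normal()          # fresh observation at x
        errors.append((beta_hat @ x - y_new) ** 2)
    return np.mean(errors)

for n in (5, 20, 100):
    print(n, mse_estimate(n))   # decreases towards sigma**2 as n grows
```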
+
+\subsection{From the bivariate normal case to linear regression}
+
+If $(X,Y)$ is drawn from a bivariate normal distribution, then one can
+write:
+\begin{displaymath}
+  Y = \condexp{Y}{X} + \big(Y-\condexp{Y}{X}\big)
+\end{displaymath}
+In the bivariate normal case, $\condexp{Y}{X}$ is an affine function of
+$X$ and the residual $Y-\condexp{Y}{X}$ is a centered normal variable
+independent of $X$, which motivates the linear model of the next
+subsection.
+
+\subsection{Linear regression}
+
+We assume a linear model:
+\begin{displaymath}
+  y_i = \beta\cdot x_i + \varepsilon_i
+\end{displaymath}
+where $\varepsilon_i$ is a normal random variable uncorrelated with
+$x_i$. We also assume that $\var(\varepsilon_i)$ is constant
+(homoscedasticity); $\sigma^2$ will denote the common value.
+
+From the database we compute the least-squares estimator of $\beta$:
+\begin{displaymath}
+  \hat{\beta}_n = (\tr XX)^{-1}\tr XY
+\end{displaymath}
+where $X$ is the $n\times k$ matrix ($k$ is the number of explanatory
+variables) whose $i$-th row is $\tr x_i$, and $Y$ is $(y_1,\ldots,y_n)$.
+
+The regression function is simply the inner product of $\hat{\beta}_n$
+with the new vector of explanatory variables $x$.
+
+From there we can compute the MSE:
+\begin{displaymath}
+  \mse(D_n)
+  =\expt{\left(\beta\cdot x + \varepsilon - \hat\beta_n\cdot x\right)^2}
+  = \tr x\expt{(\hat\beta_n-\beta)\tr{(\hat\beta_n-\beta)}} x
+  + \expt{\varepsilon^2}
+\end{displaymath}
+where we used that $\varepsilon$ is centered and independent of
+$\hat\beta_n$, so the cross term vanishes. The variance-covariance
+matrix of $\hat\beta_n$ is equal to $\sigma^2(\tr XX)^{-1}$. Finally we
+get:
+\begin{displaymath}
+  \mse(D_n)
+  = \sigma^2\tr x(\tr XX)^{-1}x + \sigma^2
+\end{displaymath}
+
+\subsubsection*{Monotonicity}
+
+We first want to study the impact of adding one observation to the
+database. First, notice that:
+\begin{displaymath}
+  \tr XX = \sum_{i=1}^n x_i \tr x_i
+\end{displaymath}
+Let us write $A= \tr X X$. Then, adding $x_0$ to the database changes
+$A$ into $A+x_0\tr x_0$.
+
+The following derivation makes extensive use of the following formula,
+found in \cite{inverse} (it can be proven by direct verification). For
+any invertible matrix $A$ and vectors $u$, $v$ with
+$1+\tr v A^{-1}u\neq 0$:
+\begin{equation}\label{eq:inverse}
+(A+u\tr v)^{-1} = A^{-1} - \frac{A^{-1}u\tr vA^{-1}}{1+\tr v A^{-1}u}
+\end{equation}
+
+Here $A=\tr XX$ is positive semidefinite, and since it is assumed
+invertible it is in fact positive definite; hence so is $A^{-1}$. We
+will denote by $x\cdot y$ the scalar product defined by $A^{-1}$ and by
+$\norm{x}$ the associated norm, that is:
+\begin{displaymath}
+  x\cdot y = \tr x A^{-1} y = \tr y A^{-1} x,
+  \qquad \norm{x}^2 = x\cdot x
+\end{displaymath}
+
+Using \eqref{eq:inverse} with $u=v=x_0$ we get:
+\begin{displaymath}
+\begin{split}
+  \tr x (A + x_0\tr x_0)^{-1} x & = \tr x A^{-1} x - \frac{\tr x
+  A^{-1}x_0\tr x_0A^{-1} x}{1+\tr x_0 A^{-1}x_0 }\\
+& = \tr x A^{-1} x - \frac{(x\cdot x_0)^2}{1+\norm{x_0}^2}
+\end{split}
+\end{displaymath}
+
+Thus:
+\begin{equation}\label{eq:decrease}
+\mse(D_n\cup\{x_0\}) = \mse(D_n) - \frac{\sigma^2(x\cdot x_0)^2}{1+\norm{x_0}^2}
+\end{equation}
+
+\emph{Adding one observation to the database never increases the MSE.}
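
As a quick numerical sanity check of \eqref{eq:inverse} and \eqref{eq:decrease} (again an editorial sketch, not part of the commit), the following compares the MSE computed directly from the updated matrix $A + x_0\tr{x_0}$ with the closed-form decrease. The random database, `sigma`, and NumPy are assumptions of the sketch.

```python
# Numerical check of the rank-one update and of the MSE decrease formula.
import numpy as np

rng = np.random.default_rng(1)
n, k, sigma = 50, 4, 0.3
X = rng.normal(size=(n, k))
A = X.T @ X                       # A = X* X for the current database
x = rng.normal(size=k)            # query point
x0 = rng.normal(size=k)           # observation to be added

A_inv = np.linalg.inv(A)
mse_before = sigma**2 * x @ A_inv @ x + sigma**2
mse_after = sigma**2 * x @ np.linalg.inv(A + np.outer(x0, x0)) @ x + sigma**2

# Closed-form decrease (eq:decrease), using the scalar product defined by A^{-1}.
decrease = sigma**2 * (x @ A_inv @ x0) ** 2 / (1 + x0 @ A_inv @ x0)

assert np.isclose(mse_before - mse_after, decrease)
assert mse_after <= mse_before + 1e-12   # adding an observation never increases the MSE
print(mse_before, mse_after, decrease)
```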
+
+\subsubsection*{Submodularity}
+
+Let $D_m$ be a database of size $m$ containing $D_n$: $D_m$ is obtained
+from $D_n$ by adding some observations. We would like to show that
+adding one observation to $D_m$ yields a smaller decrease in MSE than
+adding the same observation to $D_n$:
+\begin{displaymath}
+  \mse(D_m)-\mse(D_m\cup\{x_0\})\leq \mse(D_n)-\mse(D_n\cup\{x_0\})
+\end{displaymath}
+
+First remark that it suffices to prove this property when $D_n$ and
+$D_m$ differ by only one observation: if it holds in that case, then
+applying it repeatedly along a chain of databases between $D_n$ and
+$D_m$ yields the general property (and the general property trivially
+contains the one-observation case).
+
+Using \eqref{eq:decrease}, the decrease of the MSE when adding $x_0$ to
+the database with matrix $A$ is:
+\begin{displaymath}
+  \frac{\sigma^2(\tr x A^{-1} x_0)^2}{1+\tr x_0 A^{-1} x_0}
+\end{displaymath}
+
+If we denote by $y$ the additional observation present in $D_m$ and
+not in $D_n$, then we would like to prove that:
+\begin{displaymath}
+  \frac{\sigma^2(\tr x A^{-1} x_0)^2}{1+\tr x_0 A^{-1} x_0}
+\geq \frac{\sigma^2\left(\tr x (A+y\tr y)^{-1} x_0\right)^2}{1+\tr x_0 (A+y\tr y)^{-1} x_0}
+\end{displaymath}
+
+Using the same notations as before, and applying \eqref{eq:inverse} with
+$u=v=y$ to both the numerator and the denominator of the right-hand
+side, this is equivalent to:
+\begin{displaymath}
+  \frac{(x\cdot x_0)^2}{1+\norm{x_0}^2}
+\geq \frac{\left(\left(1+\norm{y}^2\right)(x\cdot x_0)-(x\cdot
+  y)(y\cdot x_0)\right)^2}
+{\left(1+\norm{y}^2\right)\big((1+\norm{y}^2)(1+\norm{x_0}^2)-(x_0\cdot
+y)^2\big)}
+\end{displaymath}
+
+By the Cauchy--Schwarz inequality,
+\begin{displaymath}
+  (x_0\cdot y)^2 \leq \norm{x_0}^2\norm{y}^2
+  < (1+\norm{y}^2)(1+\norm{x_0}^2),
+\end{displaymath}
+so both denominators are positive and we may cross-multiply. The
+previous inequality is thus successively equivalent to:
+\begin{multline*}
+  \left(1+\norm{y}^2\right)^2\left(1+\norm{x_0}^2\right)(x\cdot x_0)^2
+-\left(1+\norm{y}^2\right)(x_0\cdot y)^2(x\cdot x_0)^2\\
+\geq \left(1+\norm{x_0}^2\right)\left(1+\norm{y}^2\right)^2(x\cdot
+x_0)^2
++ \left(1+\norm{x_0}^2\right)(x\cdot y)^2(y\cdot x_0)^2\\
+-2\left(1+\norm{x_0}^2\right)\left(1+\norm{y}^2\right)(x\cdot
+x_0)(x\cdot y)(y\cdot x_0)
+\end{multline*}
+and, after cancelling the common term
+$\left(1+\norm{y}^2\right)^2\left(1+\norm{x_0}^2\right)(x\cdot x_0)^2$
+and rearranging, to:
+\begin{multline*}
+2\left(1+\norm{x_0}^2\right)\left(1+\norm{y}^2\right)(x\cdot
+x_0)(x\cdot y)(y\cdot x_0)\\
+\geq \left(1+\norm{x_0}^2\right)(x\cdot y)^2(y\cdot x_0)^2
++ \left(1+\norm{y}^2\right)(x_0\cdot y)^2(x\cdot x_0)^2
+\end{multline*}
+
+This last inequality is not true in general. As soon as $x$, $y$ and
+$x_0$ span a space of dimension at least $2$, it is possible that, for
+example, $(x\cdot x_0)$ and $(x\cdot y)$ are positive and $(y\cdot x_0)$
+is negative. Then the left-hand side of the inequality is negative and
+cannot be greater than the right-hand side, which is always
+nonnegative.
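
To make the failure concrete, here is a small numerical counterexample (an editorial sketch, not part of the commit): taking $A$ equal to the identity (e.g. the database $\{e_1, e_2\}$), $x = e_1$, $x_0 = e_2$ and $y = (1,-1)$, adding $x_0$ to the larger database decreases the MSE at $x$ by $\sigma^2/15$, while adding it to the smaller database decreases it by $0$. NumPy and the specific vectors are assumptions of the sketch.

```python
# Concrete 2-D counterexample to the diminishing-returns (submodularity) inequality.
import numpy as np

sigma = 1.0
A = np.eye(2)                    # database D_n with sum x_i x_i^* = I (e.g. {e1, e2})
x = np.array([1.0, 0.0])         # query point
x0 = np.array([0.0, 1.0])        # observation to be added
y = np.array([1.0, -1.0])        # extra observation contained in D_m but not in D_n

def decrease(A, x, x0):
    """MSE decrease at x when x0 is added to a database with matrix A (eq:decrease)."""
    A_inv = np.linalg.inv(A)
    return sigma**2 * (x @ A_inv @ x0) ** 2 / (1 + x0 @ A_inv @ x0)

d_small = decrease(A, x, x0)                    # adding x0 to D_n: 0
d_large = decrease(A + np.outer(y, y), x, x0)   # adding x0 to D_m: sigma^2 / 15

print(d_small, d_large)          # 0.0  0.0666...
assert d_large > d_small         # diminishing returns fails
```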
+
+In the one-dimensional case, the scalar product $(x\cdot y)$ can be
+written as $\lambda xy$ for some positive $\lambda$. Then, when $x$,
+$x_0$ and $y$ are nonzero (otherwise both sides vanish), the last
+inequality becomes:
+\begin{displaymath}
+  2\geq \frac{\lambda y^2}{1+\lambda y^2}
++ \frac{\lambda x_0^2}{1+\lambda x_0^2}
+\end{displaymath}
+which is trivially true, since each of the two terms is smaller than
+$1$ (a more direct proof for the one-dimensional case is of course
+possible). \emph{Thus the diminishing-returns property above holds in
+the one-dimensional case but fails in general.}
+\nocite{shapley,inverse,recommendation,cook,shapleyor,subsetselection11}
+\bibliographystyle{plain}
+\bibliography{notes}
+
+\end{document}
\ No newline at end of file
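
Finally, as a complement to the closing one-dimensional argument (again an editorial sketch outside the committed files), a randomized check that the diminishing-returns inequality always holds in dimension one; the parameter ranges and NumPy usage are arbitrary assumptions.

```python
# Randomized check of the one-dimensional claim: in dimension 1 the MSE decrease
# exhibits diminishing returns, i.e. adding x0 to the larger database never helps more.
import numpy as np

rng = np.random.default_rng(2)
sigma = 1.0

def decrease(a, x, x0):
    """MSE decrease at x when x0 is added to a 1-D database with a = sum of x_i^2."""
    return sigma**2 * (x * x0 / a) ** 2 / (1 + x0**2 / a)

for _ in range(10000):
    a = rng.uniform(0.1, 5.0)          # current database: a = sum of squares
    x, x0, y = rng.normal(size=3)      # query point, new point, extra point in D_m
    assert decrease(a + y**2, x, x0) <= decrease(a, x, x0) + 1e-12
print("diminishing returns held in all 1-D trials")
```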
