adding maximum likelihood approach write-up

author: jeanpouget-abadie <jean.pougetabadie@gmail.com> 2015-01-08 17:51:25 +0100
committer: jeanpouget-abadie <jean.pougetabadie@gmail.com> 2015-01-08 17:51:25 +0100
commit: 91287ed4ec2157f1318074c7aa24786d7bcef3a9 (patch)
tree: 84e162b591f3010561f393f52db45bfe0f5f808c /notes
parent: ed0b64814bcfdf94e8b85a148d76318d98f68b9b (diff)
download: cascades-91287ed4ec2157f1318074c7aa24786d7bcef3a9.tar.gz
3 files changed, 224 insertions, 11 deletions
diff --git a/notes/formalisation.pdf b/notes/formalisation.pdf
index c262cb1..760abe0 100644
--- a/notes/formalisation.pdf
+++ b/notes/formalisation.pdf
diff --git a/notes/maximum_likelihood_approach.tex b/notes/maximum_likelihood_approach.tex
new file mode 100644
index 0000000..acc196b
--- /dev/null
+++ b/notes/maximum_likelihood_approach.tex
@@ -0,0 +1,153 @@
+\documentclass[11pt]{article}
+\usepackage{fullpage, amsmath, amssymb, amsthm}
+\newtheorem{theorem}{Theorem}
+\newtheorem{lemma}{Lemma}
+\newtheorem{remark}{Remark}
+\newtheorem{proposition}{Proposition}
+
+\title{Maximum Likelihood Approach}
+\author{Jean Pouget-Abadie}
+
+\begin{document}
+
+\maketitle
+
+We consider the node $\alpha$. We index the measurements by $i \in [1, n]$. Let $b^i$ be the indicator variable for node $\alpha$ active at the round following measurememt $i$ and let $x^i$ be the vector of active nodes for measurement $i$. Recall that:
+
+\begin{equation}
+\label{eq:probability_of_infection}
+1 - \exp(\langle x^i, \theta \rangle) = \mathbb{P}(\text{node } \alpha \text{ is active at the following round})
+\end{equation}
+
+The likelihood problem can be formulated as such:
+
+\begin{equation}
+\label{eq:main_formulation}
+\min_{\theta \in \mathbb{R}^p} \quad \lambda_n \| \theta \|_1 + \sum^n_{i=1} - b^i \log \left(e^{-\langle x^i, \theta \rangle} - 1 \right) - \langle x^i, \theta \rangle
+\end{equation}
+
+We define $f(\theta):= \sum^n_{i=1} - b^i \log \left(\exp(-\langle x^i, \theta \rangle) \right) - \langle x^i, \theta \rangle$ such that Eq.~\ref{eq:main_formulation} can be rewritten as:
+
+\begin{equation}
+\label{eq:small_formulation}
+\min_{\theta \in \mathbb{R}^p} \quad f(\theta) + \lambda_n \| \theta \|_1
+\end{equation}
+
+We cite the following theorem from \cite{Negahban:2009} (roughly, because the statements of the theorem are either slightly wrong or unclear):
+
+\begin{proposition}
+\label{thm:cited_theorem}
+Let ${\cal C}:=\{\Delta \in \mathbb{R}^p : \exists S \subset [1, n] \ s.t. \ \|\Delta_{S^c}\|_1 \leq 3 \| \Delta_S \|_1 \}$. Suppose that $\theta^*$ is s-sparse, and the following two conditions are met: 
+\begin{equation}
+\lambda_n \geq 2 \|\nabla f(\theta^*) \|_\infty
+\label{eq:lambda_condition}
+\end{equation}
+\begin{equation}
+\forall \Delta \in {\cal C}, \ \Delta^T \cdot \nabla^2 f(\theta^*) \cdot \Delta \geq \gamma_n \| \Delta \|_2^2
+\label{eq:RSC_condition}
+\end{equation}
+then:
+\begin{equation}
+\| \theta - \theta^* \|_2 \leq \frac{\sqrt{s} \lambda_n}{\gamma_n}
+\end{equation}
+\end{proposition}
+
+It remains to show the two conditions for Proposition~\ref{thm:cited_theorem} are met.
+
+\section*{Condition~\ref{eq:lambda_condition}}
+Condition~\ref{eq:lambda_condition} requires us to find an upper-bound for $ 2 \|\nabla f(\theta^*) \|_\infty$. If we only consider the first measurement of every cascade, this can be done easily. Let $N$ be the number of cascades (to distinguish from $n$ number of total measurements). Begin by noting that:
+
+\begin{equation}
+\nabla_k f(\theta) = \sum^n_{i=1} \frac{b^i x^i_k}{1 - e^{\langle x^i, \theta \rangle}} - \sum^n_{i=1} x^i_k = \sum_{i=1}^n x^k_i \left( \frac{b^i}{\mathbb{P}(\text{node } \alpha \text { infected})} - 1\right)
+\end{equation}
+
+\begin{lemma}
+\label{lem:subgaussian_variable}
+$\nabla f(\theta^*)$ is a $N/p_{\min}$-subgaussian variable, where $p_{\min}$ is the smallest non-zero link to node $\alpha$.
+\end{lemma}
+
+\begin{proof}
+\begin{align}
+\mathbb{E} \left( \nabla_k f(\theta) \right) & = \sum_{i=1}^N \mathbb{E} \left[ x^i_k \left( \frac{b^i}{\mathbb{P}(\text{node } \alpha \text { infected})} - 1\right) \right] \nonumber \\
+& = \sum_{i=1}^N \mathbb{E}_S \left[ \mathbb{E}\left[x^i_k \left( \frac{b^i}{\mathbb{P}(\text{node } \alpha \text { infected})} - 1\right) \middle| S \right] \right] \quad \text{where S is the seed set} \nonumber \\
+& = \sum_{i=1}^N  \mathbb{E}\left[x^i_k \left( \frac{ \mathbb{E}_S \left[ b^i \middle| S \right]}{\mathbb{P}(\text{node } \alpha \text { infected})} - 1\right)  \right] \nonumber \\
+& = 0
+\end{align}
+Therefore, $\nabla f(\theta^*)$ is the sum of zero-mean variables, upper-bounded by $1/p_{\min}$. It follows that $\nabla f(\theta^*)$ is $N/p_{\min}$-subgaussian.
+\end{proof}
+
+By union bound and characterization of sub-gaussian variables:
+
+\begin{equation}
+\mathbb{P}(\| \nabla f(\theta) \|_{\infty} > \lambda) \leq 2 \exp \left( -\frac{\lambda^2 p_{\min}}{2n} + \log p \right)
+\end{equation}
+
+In conclusion, for $\delta>0$, $\lambda := 2 \sqrt{\frac{n^{\delta + 1} \log p}{p_{\min}}}$ meets Condition~\ref{eq:lambda_condition} with probability $1 - \exp(-n^\delta \log p )$
+
+
+\section*{Condition~\ref{eq:RSC_condition}}
+
+Finally, note that:
+\begin{equation}
+\nabla_{kj} f(\theta) = \sum_{i=1}^n \frac{b^i x_k^i x_j^i e^{\langle x^i, \theta \rangle}}{\left(1 - e^{\langle x^i, \theta \rangle} \right)^2} = \sum_{i=1}^n b^i x_k^i x_j^i \frac{\mathbb{P}(\text{node } \alpha \text { not infected})}{\mathbb{P}(\text{node } \alpha \text { infected})^2}
+\end{equation}
+
+\subsection*{First term}
+We are going to explicitate a constant $\gamma$ such that: $\forall \Delta \in {\cal C}, \Delta^T \cdot \nabla^2 f(\theta^*) \cdot \Delta \geq \gamma n \|\Delta\|_2^2$. If we denote $p_i := \mathbb{P}(\text{node } \alpha \text { infected})$, we have:
+\begin{equation}
+\Delta^T \cdot \nabla^2 f(\theta^*) \cdot \Delta = \sum_k \Delta_k^2 \left[ \sum_i b^i x_k^i \frac{1 - p_i}{p_i^2} \right] + 2 \sum_{k< j} \Delta_k \Delta_j \left[ \sum_i b^i x^i_k x^i_j \frac{1 - p_i}{p_i^2}\right]
+\end{equation}
+
+We are first going to find a lower-bound for $\sum_i b^i x_k^i \frac{1 - p_i}{p_i^2}$ by computing a lower-bound on its expectation and concluding with Hoeffding's inequality. If we only consider the first measurement of every cascade and we further suppose that $p_i < 1 - \eta$ no matter the configuration of active nodes (slightly less strong than correlation decay).
+
+\begin{align}
+\mathbb{E}\left(\sum_i b^i x_k^i \frac{1 - p_i} {p_i}^2 \right) & = \sum_i \mathbb{E} \left(x_k^i \frac{1 - p_i} {p_i}^2  \right) \nonumber \\
+& = \sum_i \mathbb{P}(b^i =1 | x_k^i =1) \mathbb{P}(x^i_k =1) \mathbb{E}\left(\frac{1 - p_i}{p_i^2} \middle| b^i =1 = x_k^i \right) \nonumber \\
+& = ATTENTION IL Y A ERREUR A LA LIGNE SUIVANTE: \\
+& \geq \min \left(1 , 1 - (1 - p_{init})^s \right)\cdot p_{init} \cdot \frac{\eta}{(1 - \eta)^2} \nonumber \\
+& \geq s p_{init}^2 \frac{\eta}{(1 - \eta)^2} \quad \text{si }s < \frac{1}{p_{init}} \nonumber \\
+& \geq p_{init} \frac{\eta}{(1 - \eta)^2} \quad \text{si }s > \frac{1}{p_{init}} \nonumber
+\end{align}
+
+We can conclude using the following Hoeffding inequality for independent random variables bounded by $[0, b_i]$ by noticing that our variables are bounded by above by $\frac{1 - p_{\min}}{p_{\min}^2}$
+
+\paragraph{Hoeffding's inequality}
+\begin{equation}
+\label{eq:hoeffding_inequality}
+\mathbb{P} \left(\sum Z_i \geq \mathbb{E}[\sum Z_i] - t \right) \leq \exp\left(- \frac{2 N t^2}{b^2} \right)
+\end{equation}
+
+It follows that for $c<1$ with probability $1 - \exp \left( - n^3 c^2 s^2 p_{init}^4 p_{\min}^4 \frac{\eta^2}{(1 - \eta)^4} \right)$, we have that $$\sum_k \Delta_k^2 \left[ \sum_i b^i x_k^i \frac{1 - p_i}{p_i^2} \right] \geq  \gamma N =: (1 -c) s p_{init}^2 \frac{\eta}{(1 - \eta)^2} N$$
+
+\begin{remark}
+Would it be possible to extend this result using Azuma's inequality on Martingales to not just the first measurement of every cascade?
+\end{remark}
+
+\subsection*{Second term}
+We are now going to find an upper-bound on the term $\sum_i b^i x^i_k x^i_j \frac{1 - p_i}{p_i^2}$.
+
+\section*{Conclusion}
+
+Suppose we show that Condition~\ref{eq:RSC_condition} is met for $\gamma_n = \gamma N$, then we have the following theorems:
+
+\begin{theorem}
+\label{thm:l2_bound}
+Suppose that $\theta^* \in \mathbb{R}^p$ is s-sparse and that we choose $\lambda_n = 2 \sqrt{\frac{n^{\delta + 1} \log p}{p_{\min}}}$ for $\delta >0$, then with probability $1 - \exp(-n^\delta \log p )$, we have
+\begin{equation}
+\|\hat \theta - \theta^* \|_2 \leq \frac{2}{\gamma} \sqrt{\frac{s \log p}{p_{\min} N^{1 - \delta}}}
+\end{equation}
+\end{theorem}
+
+Note that we can choose $\delta = 0$ in high-dimensions since the probability of success will then be $1 - \frac{1}{p} \approx 1$. We can also conclude on support recovery with the following reasoning. 
+
+\begin{theorem}
+\label{thm:support_recovery}
+Suppose that $N$ is chosen such that $\frac{2}{\gamma}\sqrt{\frac{s \log p}{p_{\min} N^{1 -\delta}}} < \eta$ and suppose we only keep as elements of the support of $\theta^*$ the coordinates $\hat \theta_i > \eta$. Then no wrong parent will be included, and all `strong' parents will be included, where `strong' signifies: $\theta^*_i > 2 \eta$.
+\end{theorem}
+
+It follows that we have found an ${\cal O}(s \log p)$ algorithm for recovering the graph, with better constants and fewer assumptions than any previous work.
+
+\bibliography{sparse}
+\bibliographystyle{plain}
+
+\end{document}
+\ No newline at end of file
diff --git a/notes/sparse.bib b/notes/sparse.bib
index 7e682a1..62c6c0f 100644
--- a/notes/sparse.bib
+++ b/notes/sparse.bib
@@ -1,11 +1,71 @@
-@ARTICLE{2005math......3066C,
-   author = {{Candes}, E. and {Romberg}, J. and {Tao}, T.},
-    title = "{Stable Signal Recovery from Incomplete and Inaccurate Measurements}",
-  journal = {ArXiv Mathematics e-prints},
-   eprint = {math/0503066},
- keywords = {Mathematics - Numerical Analysis, 94A12, 41A45, 42A10},
-     year = 2005,
-    month = mar,
-   adsurl = {http://adsabs.harvard.edu/abs/2005math......3066C},
-  adsnote = {Provided by the SAO/NASA Astrophysics Data System}
-}
-\ No newline at end of file
+@article {CandesRomberTao:2006,
+author = {Candès, Emmanuel J. and Romberg, Justin K. and Tao, Terence},
+title = {Stable signal recovery from incomplete and inaccurate measurements},
+journal = {Communications on Pure and Applied Mathematics},
+volume = {59},
+number = {8},
+publisher = {Wiley Subscription Services, Inc., A Wiley Company},
+issn = {1097-0312},
+pages = {1207--1223},
+year = {2006},
+}
+
+
+@inproceedings{GomezRodriguez:2010,
+ author = {Gomez Rodriguez, Manuel and Leskovec, Jure and Krause, Andreas},
+ title = {Inferring Networks of Diffusion and Influence},
+ booktitle = {Proceedings of the 16th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
+ series = {KDD '10},
+ year = {2010},
+ isbn = {978-1-4503-0055-1},
+ location = {Washington, DC, USA},
+ pages = {1019--1028},
+ numpages = {10},
+ publisher = {ACM},
+ address = {New York, NY, USA},
+} 
+
+
+@article{Netrapalli:2012,
+ author = {Netrapalli, Praneeth and Sanghavi, Sujay},
+ title = {Learning the Graph of Epidemic Cascades},
+ journal = {SIGMETRICS Perform. Eval. Rev.},
+ volume = {40},
+ number = {1},
+ month = {June},
+ year = {2012},
+ issn = {0163-5999},
+ pages = {211--222},
+ numpages = {12},
+ publisher = {ACM},
+ address = {New York, NY, USA},
+ keywords = {cascades, epidemics, graph structure learning},
+}
+
+@article{Negahban:2009,
+ author = {Negahban, Sahand N. and Ravikumar, Pradeep and Wrainwright, Martin J. and Yu, Bin},
+ title = {A Unified Framework for High-Dimensional Analysis of M-Estimators with Decomposable Regularizers},
+ Journal = {Statistical Science},
+ year = {2012},
+ month = {December},
+ volume = {27},
+ number = {4},
+ pages = {538--557},
+}
+
+\bibitem{abrahao}
+Abrahao, B., Chierichetti, F., Kleinberg, R., and Panconesi, A.
+\newblock{\it Trace Complexity of Network Inference}
+\newblock In Proc. 19th ACM SIGKDD international conference on Knowledge discovery and data mingin, KDD'13, pages 491--499,
+\newblock 2013
+
+\bibitem{candes}
+Candès, E., and Plan, Y.
+\newblock {\it A Probabilistic and RIPless Theory of Compressed Sensing}
+\newblock Information Theory, IEEE Transactions on, 57(11): 7235--7254,
+\newblock 2011.
+
+\bibitem{snap}
+Leskovec, J., and Krevl, A.,
+\newblock {\it SNAP Datasets: Stanford Large Network Dataset Collection},
+\newblock 2014.
+\ No newline at end of file
author	jeanpouget-abadie <jean.pougetabadie@gmail.com>	2015-01-08 17:51:25 +0100
committer	jeanpouget-abadie <jean.pougetabadie@gmail.com>	2015-01-08 17:51:25 +0100
commit	91287ed4ec2157f1318074c7aa24786d7bcef3a9 (patch)
tree	84e162b591f3010561f393f52db45bfe0f5f808c /notes
parent	ed0b64814bcfdf94e8b85a148d76318d98f68b9b (diff)
download	cascades-91287ed4ec2157f1318074c7aa24786d7bcef3a9.tar.gz