notes/maximum_likelihood_approach.tex


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153

\documentclass[11pt]{article}
\usepackage{fullpage, amsmath, amssymb, amsthm}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{remark}{Remark}
\newtheorem{proposition}{Proposition}

\title{Maximum Likelihood Approach}
\author{Jean Pouget-Abadie}

\begin{document}

\maketitle

We consider the node $\alpha$. We index the measurements by $i \in [1, n]$. Let $b^i$ be the indicator variable for node $\alpha$ active at the round following measurememt $i$ and let $x^i$ be the vector of active nodes for measurement $i$. Recall that:

\begin{equation}
\label{eq:probability_of_infection}
1 - \exp(\langle x^i, \theta \rangle) = \mathbb{P}(\text{node } \alpha \text{ is active at the following round})
\end{equation}

The likelihood problem can be formulated as such:

\begin{equation}
\label{eq:main_formulation}
\min_{\theta \in \mathbb{R}^p} \quad \lambda_n \| \theta \|_1 + \sum^n_{i=1} - b^i \log \left(e^{-\langle x^i, \theta \rangle} - 1 \right) - \langle x^i, \theta \rangle
\end{equation}

We define $f(\theta):= \sum^n_{i=1} - b^i \log \left(\exp(-\langle x^i, \theta \rangle) \right) - \langle x^i, \theta \rangle$ such that Eq.~\ref{eq:main_formulation} can be rewritten as:

\begin{equation}
\label{eq:small_formulation}
\min_{\theta \in \mathbb{R}^p} \quad f(\theta) + \lambda_n \| \theta \|_1
\end{equation}

We cite the following theorem from \cite{Negahban:2009} (roughly, because the statements of the theorem are either slightly wrong or unclear):

\begin{proposition}
\label{thm:cited_theorem}
Let ${\cal C}:=\{\Delta \in \mathbb{R}^p : \exists S \subset [1, n] \ s.t. \ \|\Delta_{S^c}\|_1 \leq 3 \| \Delta_S \|_1 \}$. Suppose that $\theta^*$ is s-sparse, and the following two conditions are met: 
\begin{equation}
\lambda_n \geq 2 \|\nabla f(\theta^*) \|_\infty
\label{eq:lambda_condition}
\end{equation}
\begin{equation}
\forall \Delta \in {\cal C}, \ \Delta^T \cdot \nabla^2 f(\theta^*) \cdot \Delta \geq \gamma_n \| \Delta \|_2^2
\label{eq:RSC_condition}
\end{equation}
then:
\begin{equation}
\| \theta - \theta^* \|_2 \leq \frac{\sqrt{s} \lambda_n}{\gamma_n}
\end{equation}
\end{proposition}

It remains to show the two conditions for Proposition~\ref{thm:cited_theorem} are met.

\section*{Condition~\ref{eq:lambda_condition}}
Condition~\ref{eq:lambda_condition} requires us to find an upper-bound for $ 2 \|\nabla f(\theta^*) \|_\infty$. If we only consider the first measurement of every cascade, this can be done easily. Let $N$ be the number of cascades (to distinguish from $n$ number of total measurements). Begin by noting that:

\begin{equation}
\nabla_k f(\theta) = \sum^n_{i=1} \frac{b^i x^i_k}{1 - e^{\langle x^i, \theta \rangle}} - \sum^n_{i=1} x^i_k = \sum_{i=1}^n x^k_i \left( \frac{b^i}{\mathbb{P}(\text{node } \alpha \text { infected})} - 1\right)
\end{equation}

\begin{lemma}
\label{lem:subgaussian_variable}
$\nabla f(\theta^*)$ is a $N/p_{\min}$-subgaussian variable, where $p_{\min}$ is the smallest non-zero link to node $\alpha$.
\end{lemma}

\begin{proof}
\begin{align}
\mathbb{E} \left( \nabla_k f(\theta) \right) & = \sum_{i=1}^N \mathbb{E} \left[ x^i_k \left( \frac{b^i}{\mathbb{P}(\text{node } \alpha \text { infected})} - 1\right) \right] \nonumber \\
& = \sum_{i=1}^N \mathbb{E}_S \left[ \mathbb{E}\left[x^i_k \left( \frac{b^i}{\mathbb{P}(\text{node } \alpha \text { infected})} - 1\right) \middle| S \right] \right] \quad \text{where S is the seed set} \nonumber \\
& = \sum_{i=1}^N  \mathbb{E}\left[x^i_k \left( \frac{ \mathbb{E}_S \left[ b^i \middle| S \right]}{\mathbb{P}(\text{node } \alpha \text { infected})} - 1\right)  \right] \nonumber \\
& = 0
\end{align}
Therefore, $\nabla f(\theta^*)$ is the sum of zero-mean variables, upper-bounded by $1/p_{\min}$. It follows that $\nabla f(\theta^*)$ is $N/p_{\min}$-subgaussian.
\end{proof}

By union bound and characterization of sub-gaussian variables:

\begin{equation}
\mathbb{P}(\| \nabla f(\theta) \|_{\infty} > \lambda) \leq 2 \exp \left( -\frac{\lambda^2 p_{\min}}{2n} + \log p \right)
\end{equation}

In conclusion, for $\delta>0$, $\lambda := 2 \sqrt{\frac{n^{\delta + 1} \log p}{p_{\min}}}$ meets Condition~\ref{eq:lambda_condition} with probability $1 - \exp(-n^\delta \log p )$


\section*{Condition~\ref{eq:RSC_condition}}

Finally, note that:
\begin{equation}
\nabla_{kj} f(\theta) = \sum_{i=1}^n \frac{b^i x_k^i x_j^i e^{\langle x^i, \theta \rangle}}{\left(1 - e^{\langle x^i, \theta \rangle} \right)^2} = \sum_{i=1}^n b^i x_k^i x_j^i \frac{\mathbb{P}(\text{node } \alpha \text { not infected})}{\mathbb{P}(\text{node } \alpha \text { infected})^2}
\end{equation}

\subsection*{First term}
We are going to explicitate a constant $\gamma$ such that: $\forall \Delta \in {\cal C}, \Delta^T \cdot \nabla^2 f(\theta^*) \cdot \Delta \geq \gamma n \|\Delta\|_2^2$. If we denote $p_i := \mathbb{P}(\text{node } \alpha \text { infected})$, we have:
\begin{equation}
\Delta^T \cdot \nabla^2 f(\theta^*) \cdot \Delta = \sum_k \Delta_k^2 \left[ \sum_i b^i x_k^i \frac{1 - p_i}{p_i^2} \right] + 2 \sum_{k< j} \Delta_k \Delta_j \left[ \sum_i b^i x^i_k x^i_j \frac{1 - p_i}{p_i^2}\right]
\end{equation}

We are first going to find a lower-bound for $\sum_i b^i x_k^i \frac{1 - p_i}{p_i^2}$ by computing a lower-bound on its expectation and concluding with Hoeffding's inequality. If we only consider the first measurement of every cascade and we further suppose that $p_i < 1 - \eta$ no matter the configuration of active nodes (slightly less strong than correlation decay).

\begin{align}
\mathbb{E}\left(\sum_i b^i x_k^i \frac{1 - p_i} {p_i}^2 \right) & = \sum_i \mathbb{E} \left(x_k^i \frac{1 - p_i} {p_i}^2  \right) \nonumber \\
& = \sum_i \mathbb{P}(b^i =1 | x_k^i =1) \mathbb{P}(x^i_k =1) \mathbb{E}\left(\frac{1 - p_i}{p_i^2} \middle| b^i =1 = x_k^i \right) \nonumber \\
& = ATTENTION IL Y A ERREUR A LA LIGNE SUIVANTE: \\
& \geq \min \left(1 , 1 - (1 - p_{init})^s \right)\cdot p_{init} \cdot \frac{\eta}{(1 - \eta)^2} \nonumber \\
& \geq s p_{init}^2 \frac{\eta}{(1 - \eta)^2} \quad \text{si }s < \frac{1}{p_{init}} \nonumber \\
& \geq p_{init} \frac{\eta}{(1 - \eta)^2} \quad \text{si }s > \frac{1}{p_{init}} \nonumber
\end{align}

We can conclude using the following Hoeffding inequality for independent random variables bounded by $[0, b_i]$ by noticing that our variables are bounded by above by $\frac{1 - p_{\min}}{p_{\min}^2}$

\paragraph{Hoeffding's inequality}
\begin{equation}
\label{eq:hoeffding_inequality}
\mathbb{P} \left(\sum Z_i \geq \mathbb{E}[\sum Z_i] - t \right) \leq \exp\left(- \frac{2 N t^2}{b^2} \right)
\end{equation}

It follows that for $c<1$ with probability $1 - \exp \left( - n^3 c^2 s^2 p_{init}^4 p_{\min}^4 \frac{\eta^2}{(1 - \eta)^4} \right)$, we have that $$\sum_k \Delta_k^2 \left[ \sum_i b^i x_k^i \frac{1 - p_i}{p_i^2} \right] \geq  \gamma N =: (1 -c) s p_{init}^2 \frac{\eta}{(1 - \eta)^2} N$$

\begin{remark}
Would it be possible to extend this result using Azuma's inequality on Martingales to not just the first measurement of every cascade?
\end{remark}

\subsection*{Second term}
We are now going to find an upper-bound on the term $\sum_i b^i x^i_k x^i_j \frac{1 - p_i}{p_i^2}$.

\section*{Conclusion}

Suppose we show that Condition~\ref{eq:RSC_condition} is met for $\gamma_n = \gamma N$, then we have the following theorems:

\begin{theorem}
\label{thm:l2_bound}
Suppose that $\theta^* \in \mathbb{R}^p$ is s-sparse and that we choose $\lambda_n = 2 \sqrt{\frac{n^{\delta + 1} \log p}{p_{\min}}}$ for $\delta >0$, then with probability $1 - \exp(-n^\delta \log p )$, we have
\begin{equation}
\|\hat \theta - \theta^* \|_2 \leq \frac{2}{\gamma} \sqrt{\frac{s \log p}{p_{\min} N^{1 - \delta}}}
\end{equation}
\end{theorem}

Note that we can choose $\delta = 0$ in high-dimensions since the probability of success will then be $1 - \frac{1}{p} \approx 1$. We can also conclude on support recovery with the following reasoning. 

\begin{theorem}
\label{thm:support_recovery}
Suppose that $N$ is chosen such that $\frac{2}{\gamma}\sqrt{\frac{s \log p}{p_{\min} N^{1 -\delta}}} < \eta$ and suppose we only keep as elements of the support of $\theta^*$ the coordinates $\hat \theta_i > \eta$. Then no wrong parent will be included, and all `strong' parents will be included, where `strong' signifies: $\theta^*_i > 2 \eta$.
\end{theorem}

It follows that we have found an ${\cal O}(s \log p)$ algorithm for recovering the graph, with better constants and fewer assumptions than any previous work.

\bibliography{sparse}
\bibliographystyle{plain}

\end{document}