old_work/maximum_likelihood_approach.tex


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230

\documentclass[11pt]{article}
\usepackage{fullpage, amsmath, amssymb, amsthm}
\usepackage{color}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{remark}{Remark}
\newtheorem{proposition}{Proposition}

\title{Maximum Likelihood Approach}
\author{Jean Pouget-Abadie}

\begin{document}

\maketitle

We consider the node $\alpha$. We index the measurements by $i \in [1, n]$. Let $b^i$ be the indicator variable for node $\alpha$ active at the round following measurememt $i$ and let $x^i$ be the vector of active nodes for measurement $i$. Recall that:

\begin{equation}
\label{eq:probability_of_infection}
1 - \exp(\langle x^i, \theta \rangle) = \mathbb{P}(\text{node } \alpha \text{ is active at the following round})
\end{equation}

The likelihood problem can be formulated as such:

\begin{equation}
\label{eq:main_formulation}
\min_{\theta \in \mathbb{R}^p} \quad \lambda_n \| \theta \|_1 + \sum^n_{i=1} - b^i \log \left(e^{-\langle x^i, \theta \rangle} - 1 \right) - \langle x^i, \theta \rangle
\end{equation}

We define $f(\theta):= \sum^n_{i=1} - b^i \log \left(\exp(-\langle x^i, \theta \rangle) \right) - \langle x^i, \theta \rangle$ such that Eq.~\ref{eq:main_formulation} can be rewritten as:

\begin{equation}
\label{eq:small_formulation}
\min_{\theta \in \mathbb{R}^p} \quad f(\theta) + \lambda_n \| \theta \|_1
\end{equation}

We cite the following theorem from \cite{Negahban:2009} (roughly, because the statements of the theorem are either slightly wrong or unclear):

\begin{proposition}
\label{thm:cited_theorem}
Let ${\cal C}:=\{\Delta \in \mathbb{R}^p : \exists S \subset [1, n] \ s.t. \ \|\Delta_{S^c}\|_1 \leq 3 \| \Delta_S \|_1 \}$. Suppose that $\theta^*$ is s-sparse, and the following two conditions are met: 
\begin{equation}
\lambda_n \geq 2 \|\nabla f(\theta^*) \|_\infty
\label{eq:lambda_condition}
\end{equation}
\begin{equation}
\forall \Delta \in {\cal C}, \ \Delta^T \cdot \nabla^2 f(\theta^*) \cdot \Delta \geq \gamma_n \| \Delta \|_2^2
\label{eq:RSC_condition}
\end{equation}
then:
\begin{equation}
\| \theta - \theta^* \|_2 \leq \frac{\sqrt{s} \lambda_n}{\gamma_n}
\end{equation}
\end{proposition}

It remains to show the two conditions for Proposition~\ref{thm:cited_theorem} are met.

\section*{Condition~\ref{eq:lambda_condition}}
Condition~\ref{eq:lambda_condition} requires us to find an upper-bound for $ 2 \|\nabla f(\theta^*) \|_\infty$. If we only consider the first measurement of every cascade, this can be done easily. Let $N$ be the number of cascades (to distinguish from $n$ number of total measurements). Begin by noting that:

\begin{equation}
\nabla_k f(\theta) = \sum^n_{i=1} \frac{b^i x^i_k}{1 - e^{\langle x^i, \theta \rangle}} - \sum^n_{i=1} x^i_k = \sum_{i=1}^n x^k_i \left( \frac{b^i}{\mathbb{P}(\text{node } \alpha \text { infected})} - 1\right)
\end{equation}

\begin{lemma}
\label{lem:subgaussian_variable}
$\nabla f(\theta^*)$ is a $N/p_{\min}$-subgaussian variable, where $p_{\min}$ is the smallest non-zero link to node $\alpha$.
\end{lemma}

\begin{proof}
\begin{align}
\mathbb{E} \left( \nabla_k f(\theta) \right) & = \sum_{i=1}^N \mathbb{E} \left[ x^i_k \left( \frac{b^i}{\mathbb{P}(\text{node } \alpha \text { infected})} - 1\right) \right] \nonumber \\
& = \sum_{i=1}^N \mathbb{E}_S \left[ \mathbb{E}\left[x^i_k \left( \frac{b^i}{\mathbb{P}(\text{node } \alpha \text { infected})} - 1\right) \middle| S \right] \right] \quad \text{where S is the seed set} \nonumber \\
& = \sum_{i=1}^N  \mathbb{E}\left[x^i_k \left( \frac{ \mathbb{E}_S \left[ b^i \middle| S \right]}{\mathbb{P}(\text{node } \alpha \text { infected})} - 1\right)  \right] \nonumber \\
& = 0
\end{align}
Therefore, $\nabla f(\theta^*)$ is the sum of zero-mean variables, upper-bounded by $1/p_{\min}$. It follows that $\nabla f(\theta^*)$ is $N/p_{\min}$-subgaussian.
\end{proof}

By union bound and characterization of sub-gaussian variables:

\begin{equation}
\mathbb{P}(\| \nabla f(\theta) \|_{\infty} > \lambda) \leq 2 \exp \left( -\frac{\lambda^2 p_{\min}}{2n} + \log p \right)
\end{equation}

In conclusion, for $\delta>0$, $\lambda := 2 \sqrt{\frac{n^{\delta + 1} \log p}{p_{\min}}}$ meets Condition~\ref{eq:lambda_condition} with probability $1 - \exp(-n^\delta \log p )$


\section*{Condition~\ref{eq:RSC_condition}}

Note that:
\begin{equation}
\nabla_{kj} f(\theta) = \sum_{i=1}^n \frac{b^i x_k^i x_j^i e^{\langle x^i, \theta \rangle}}{\left(1 - e^{\langle x^i, \theta \rangle} \right)^2} = \sum_{i=1}^n b^i x_k^i x_j^i \frac{\mathbb{P}(\text{node } \alpha \text { not infected})}{\mathbb{P}(\text{node } \alpha \text { infected})^2}
\end{equation}


We are going to explicitate a constant $\gamma$ such that: $\forall \Delta \in {\cal C}, \Delta^T \cdot \nabla^2 f(\theta^*) \cdot \Delta \geq \gamma n \|\Delta\|_2^2$. 

\paragraph{Notation} Let $p_i := \mathbb{P}(\text{node } \alpha \text { infected})$. Let $Z^i_k := b^i x^i_k \frac{1-p_i}{p_i^2}$and let $Z^i_{k,j} := b^i x^i_k x^i_j \frac{1-p_i}{p_i^2}$. We also define $Z_k := \sum_i Z^i_k$ and $Z_{k,j} := \sum_i Z^i_{k,j}$.

\begin{align}
\Delta^T \cdot \nabla^2 f(\theta^*) \cdot \Delta & = \sum_k \Delta_k^2 \left[ \sum_i b^i x_k^i \frac{1 - p_i}{p_i^2} \right] + 2 \sum_{k< j} \Delta_k \Delta_j \left[ \sum_i b^i x^i_k x^i_j \frac{1 - p_i}{p_i^2}\right] \nonumber \\ 
& = \sum_k \Delta_k^2 Z_k + 2 \sum_{k< j} \Delta_k \Delta_j Z_{k,j} \nonumber
\end{align}


\begin{lemma}
\label{lem:first_term}
Suppose that $\forall k$, $\mathbb{E}_{S(k)} \left[ \frac{1}{p_i} \right] \geq 1 + \alpha$, then with probability greater than $1 - 2 p e^{-2 \alpha^2 (1-c)^2p_{\min}^2 p_{\text{init}}^2 N}$, $$\forall k, \ Z_k > c \alpha p_{\text{init}} N$$
\end{lemma}

\begin{proof}
Let $S(k)$ denote the active set conditioned on the fact that node $k$ is active AND that one parent is active. We denote $p_{S(k)}$ the probability that the active set verifies the previous two conditions.
\begin{align}
\nonumber
\mathbb{E}[Z^i_k] & = p_{S(k)} \mathbb{E}_{S(k)} \left[ \mathbb{E}[b^i | S(k)] \frac{1 - p_i}{p_i^2} \right] \\ \nonumber
& = p_{S(k)} \left( \mathbb{E}_{S(k)} \left[ \frac{1}{p_i} \right] - 1 \right) \\ \nonumber
& \geq \alpha p_{S(k)} \quad \text{by assumption}
\end{align}

Note that $|Z^i_k| < \frac{1}{p_{\text{min}}^2}$ {\it a.s.}. By Hoeffding's first inequality, for $0<c <1$,

\begin{align}
\nonumber
\mathbb{P}\left(Z_k < c \alpha p_{\text{init}} N \right) & < 2 e^{- \frac{2(1-c)^2}{Nb^2} \left( \mathbb{E}_{S(k)} \left[ \frac{1}{p_i} \right] - 1 \right)^2} \\ \nonumber
& < 2 e^{-2 \alpha^2 (1-c)^2p_{\min}^2 p_{S(k)}^2 N}
\end{align}

We conclude by union bound.
\end{proof}

\begin{lemma}
Suppose that $\forall k,j$, $1 + \alpha \leq \mathbb{E}_{S(k, j)} \left[ \frac{1}{p_i} \right] \leq 1 + \beta$, then with probability greater than $1 - pe^{-2 \alpha^2 (1-c)^2p_{\min}^2 p_{S(k,j)}^2 N}$, $$\forall k,j, \ Z_{k,j} < c \beta p_{S(k,j))} N$$
\end{lemma}

\begin{proof}
We follow the same reasoning as Lemma~\ref{lem:first_term}:
\begin{align}
\nonumber
\mathbb{E}[Z^i_{k,j}] & = p_{S(k,j)} \left( \mathbb{E}_{S(k,j)} \left[ \frac{1}{p_i} \right] - 1 \right) \\ \nonumber
& \leq \beta p_{S(k,j)} \quad \text{by assumption}
\end{align}

By Hoeffding's second inequality, for $0 < c < 1$,
\begin{align}
\nonumber
\mathbb{P}\left(Z_{k,j} > c \beta p_{S(k,j)} N \right) & \leq e^{- \frac{2(1-c)^2}{Nb^2} \left( \mathbb{E}_{S(k,j)} \left[ \frac{1}{p_i} \right] - 1 \right)^2} \\ \nonumber
& \leq e^{-2 \alpha^2 (1-c)^2p_{\min}^2 p_{S(k,j)}^2 N} 
\end{align}
We conclude by union bound.
\end{proof}

\begin{proposition}
Suppose that $\forall k,j$, $1 + \alpha \leq \mathbb{E}_{S(k, j)} \left[ \frac{1}{p_i} \right] \leq 1 + \beta$, then with probability greater than $1 - XXX$, condition~\ref{eq:RSC_condition} is met with $\gamma_n = \gamma n$ where $\gamma := \alpha p_{S(k)} - 16 \sqrt{s} \beta p_{S(k,j)}$
\end{proposition}

\begin{proof}
(Sketch) By the triangle inequality followed by MacLaurin's inequality,
\begin{align}
\mid \frac{2}{\binom{n}{2}} \sum_{i<j} \Delta_k \Delta_j \mid & \leq \frac{1}{n^2} \sum_k \mid \Delta_k \mid \nonumber \\
\mid 2 \sum_{i<j} \Delta_k \Delta_j \mid & \leq \|\Delta\|_1 \leq 4 \sqrt{s} \| \Delta \|_2 \quad \text{ since } \Delta \in {\cal C} \nonumber
\end{align}
\end{proof}

\paragraph{Hoeffding's inequality}

For $t \in \mathbb{R}$ and independent variables $Z_i$ such that $|Z_i|<b$ {\it a.s.}, we have:
\begin{align}
\label{eq:hoeffding_inequality}
\mathbb{P} \left( \middle| \sum_{i=1}^N Z_i - \mathbb{E}\left[\sum_{i=1}^N Z_i \right] \middle| \geq t \right) \leq 2 \exp\left(- \frac{2 t^2}{N b^2} \right) \nonumber \\
\mathbb{P} \left(\sum_{i=1}^N Z_i \geq \mathbb{E}\left[\sum_{i=1}^N Z_i \right] + t \right) \leq \exp\left(- \frac{2 t^2}{N b^2} \right) \nonumber
\end{align}


% \subsection*{First term}

% We are first going to find a lower-bound for $\sum_i b^i x_k^i \frac{1 - p_i}{p_i^2}$ by computing a lower-bound on its expectation and concluding with Hoeffding's inequality. If we only consider the first measurement of every cascade and we further suppose that $p_i < 1 - \eta$ no matter the configuration of active nodes (slightly less strong than correlation decay).

% \begin{align}
% \mathbb{E}\left(\sum_i b^i x_k^i \frac{1 - p_i} {p_i}^2 \right) & = \sum_i \mathbb{E} \left(x_k^i \frac{1 - p_i} {p_i}^2  \right) \nonumber \\
% & = \sum_i \mathbb{P}(b^i =1 | x_k^i =1) \mathbb{P}(x^i_k =1) \mathbb{E}\left(\frac{1 - p_i}{p_i^2} \middle| b^i =1 = x_k^i \right) \nonumber \\
% & \geq \sum_{i=1}^n p_{\min} \cdot \min \left(1 , 1 - (1 - p_{init})^s \right)\cdot p_{init} \cdot \frac{\eta}{(1 - \eta)^2} \nonumber \\
% & \geq s p_{init}^2 p_{\min} \frac{\eta}{(1 - \eta)^2} \quad \text{si }s < \frac{1}{p_{init}} \nonumber \\
% & \geq p_{init} p_{\min} \frac{\eta}{(1 - \eta)^2} \quad \text{si }s > \frac{1}{p_{init}} \nonumber
% \end{align}

% We can conclude using the following Hoeffding inequality for independent random variables bounded by $[0, b_i]$ by noticing that our variables are bounded by above by $\frac{1 - p_{\min}}{p_{\min}^2}$

% \paragraph{Hoeffding's inequality}
% \begin{equation}
% \label{eq:hoeffding_inequality}
% \mathbb{P} \left(\sum Z_i \geq \mathbb{E}[\sum Z_i] - t \right) \leq \exp\left(- \frac{2 N t^2}{b^2} \right)
% \end{equation}

% It follows that for $c<1$ with probability $1 - \exp \left( - n^3 c^2 s^2 p_{init}^4 p_{\min}^6 \frac{\eta^2}{(1 - \eta)^4} \right)$, we have that $$\sum_k \Delta_k^2 \left[ \sum_i b^i x_k^i \frac{1 - p_i}{p_i^2} \right] \geq  \gamma N =: (1 -c) s p_{init}^2 p_{\min} \frac{\eta}{(1 - \eta)^2} N$$

% \begin{remark}
% Would it be possible to extend this result using Azuma's inequality on Martingales to not just the first measurement of every cascade?
% \end{remark}

% \subsection*{Second term}
% We are now going to find an upper-bound on the term $\sum_i b^i x^i_k x^i_j \frac{1 - p_i}{p_i^2}$.


\section*{Conclusion}

Suppose we show that Condition~\ref{eq:RSC_condition} is met for $\gamma_n = \gamma N$, then we have the following theorems:

\begin{theorem}
\label{thm:l2_bound}
Suppose that $\theta^* \in \mathbb{R}^p$ is s-sparse and that we choose $\lambda_n = 2 \sqrt{\frac{n^{\delta + 1} \log p}{p_{\min}}}$ for $\delta >0$, then with probability $1 - \exp(-n^\delta \log p )$, we have
\begin{equation}
\|\hat \theta - \theta^* \|_2 \leq \frac{2}{\gamma} \sqrt{\frac{s \log p}{p_{\min} N^{1 - \delta}}}
\end{equation}
\end{theorem}

Note that we can choose $\delta = 0$ in high-dimensions since the probability of success will then be $1 - \frac{1}{p} \approx 1$. We can also conclude on support recovery with the following reasoning. 

\begin{theorem}
\label{thm:support_recovery}
Suppose that $N$ is chosen such that $\frac{2}{\gamma}\sqrt{\frac{s \log p}{p_{\min} N^{1 -\delta}}} < \eta$ and suppose we only keep as elements of the support of $\theta^*$ the coordinates $\hat \theta_i > \eta$. Then no wrong parent will be included, and all `strong' parents will be included, where `strong' signifies: $\theta^*_i > 2 \eta$.
\end{theorem}

It follows that we have found an ${\cal O}(s \log p)$ algorithm for recovering the graph, with better constants and fewer assumptions than any previous work.

\bibliography{sparse}
\bibliographystyle{plain}

\end{document}