In this section, we apply the sparse recovery framework to analyze under which
assumptions our program~\eqref{eq:pre-mle} recovers the true parameter
$\theta_i$ of the cascade model. Furthermore, if we can estimate $\theta_i$ with
sufficient accuracy, it is then possible to recover the support of
$\theta_i$ by simple thresholding, which provides a solution to the standard
Graph Inference problem.
We will first give results in the exactly sparse setting in which $\theta_i$
has a support of size exactly $s$. We will then relax this sparsity constraint
and give results in the \emph{stable recovery} setting where $\theta_i$ is
approximately $s$-sparse.
As mentioned in Section~\ref{sec:mle}, the maximum likelihood estimation
program is decomposable. We will henceforth focus on a single node $i\in V$ and
omit the subscript $i$ from the notation when there is no ambiguity. The
recovery problem is now that of estimating a single vector $\theta^*$ from
a set $\mathcal{T}$ of observations. We will write $n\defeq |\mathcal{T}|$.
\begin{comment}
As mentioned previously, our objective is twofold:
\begin{enumerate}
\item We seek to recover the edges of the graph. In other words, for each
node $j$, we wish to identify the set of indices $\{i: \Theta_{ij} \neq
0\}$ .
\item We seek to recover the edge weights $\Theta$ of ${\cal G}$ i.e.
upper-bound the error $\|\hat \Theta - \Theta \|_2$.
\end{enumerate}
It is easy to see that solving the second objective allows us to solve the
first. If $\|\hat \Theta - \Theta \|_2$ is sufficiently small, we can recover
all large coordinates of $\Theta$ by keeping only the coordinates above
a certain threshold.
\end{comment}
\subsection{Main Theorem}
\label{sec:main_theorem}
In this section, we analyze the case where $\theta^*$ is exactly sparse. We
write $S\defeq\text{supp}(\theta^*)$ and $s=|S|$. Our main theorem will rely
on the now standard \emph{restricted eigenvalue condition}.
\begin{definition}
Let $\Sigma\in\mathcal{S}_m(\reals)$ be a real symmetric matrix and $S$ be
a subset of $\{1,\ldots,m\}$. Define $\mathcal{C}(S)\defeq
\{X\in\reals^m\,:\,\|X\|_1\leq 1\text{ and } \|X_{S^c}\|_1\leq
3\|X_S\|_1\}$. We say that $\Sigma$ satisfies the
$(S,\gamma)$-\emph{restricted eigenvalue condition} if:
\begin{equation}
\forall X \in {\cal C}(S), \quad X^\top \Sigma X \geq \gamma \|X\|_2^2
\tag{RE}
\label{eq:re}
\end{equation}
\end{definition}
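For instance, the identity matrix $I_m$ satisfies the
$(S,\gamma)$-restricted eigenvalue condition with $\gamma=1$ for any $S$, and
more generally so does any $\Sigma\succeq\gamma I_m$. The interest of the
restricted condition is that it may also hold for positive semi-definite
matrices which are singular, as long as the null space of $\Sigma$ intersects
the cone $\mathcal{C}(S)$ only at the origin.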
A discussion of this assumption in the context of generalized linear cascade
models can be found in Section~\ref{sec:}.
We will also need the following assumption on the inverse link function $f$ of
the generalized linear cascade model:
\begin{equation}
\tag{LF}
\max\left(\left|\frac{f'}{f}(\inprod{\theta^*}{x})\right|,
\left|\frac{f'}{1-f}(\inprod{\theta^*}{x})\right|\right)\leq
\frac{1}{\alpha}
\end{equation}
for all $x\in\reals^m$ such that $f(\inprod{\theta^*}{x})\notin\{0,1\}$.
It is easy to verify that this holds, with $\alpha$ depending only on
$\delta'$, if $\delta'\leq p_{i,j}\leq 1-\delta'$ for all $(i,j)\in E$ in the
Voter model, and if $p_{i,j}\geq \delta'$ for all $(i,j)\in E$ in the
Independent Cascade model, for some constant $\delta'>0$.
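As a concrete illustration (a sketch under the parametrization used for the
Independent Cascade model below, where $\Theta_{i,j}=\log(1-p_{i,j})$ and hence
the inverse link function is $f(z)=1-e^{z}$), a direct computation gives:
\begin{displaymath}
\frac{f'}{1-f}(z) = \frac{-e^{z}}{e^{z}} = -1
\qquad\text{and}\qquad
\left|\frac{f'}{f}(z)\right| = \frac{e^{z}}{1-e^{z}} = \frac{1-f(z)}{f(z)}
\end{displaymath}
Whenever $f(\inprod{\theta^*}{x})\notin\{0,1\}$, at least one parent $j$ is
active, so that $f(\inprod{\theta^*}{x})\geq p_{i,j}\geq\delta'$ and
{\bf(LF)} holds with $\frac{1}{\alpha}=\max\left(1,\frac{1-\delta'}{\delta'}\right)$.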
\begin{theorem}
\label{thm:main}
Assume the Hessian $\nabla^2\mathcal{L}(\theta^*)$ satisfies the
$(S,\gamma)$-restricted eigenvalue condition \eqref{eq:re} for some
$\gamma > 0$ and that {\bf (LF)} holds for some $\alpha > 0$. For any
$\delta\in(0,1)$, let $\hat{\theta}$ be the solution of \eqref{eq:pre-mle} with
$\lambda \defeq 2\sqrt{\frac{\log m}{\alpha n^{1 - \delta}}}$. Then:
\begin{equation}
\label{eq:sparse}
\|\hat \theta - \theta^* \|_2
\leq \frac{3}{\gamma} \sqrt{\frac{s \log m}{\alpha n^{1-\delta}}}
\quad
\text{w.p.}\;1-\frac{1}{e^{n^\delta \log m}}
\end{equation}
\end{theorem}
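The parameter $\delta$ in Theorem~\ref{thm:main} trades off accuracy against
confidence: increasing $\delta$ makes the failure probability
$e^{-n^\delta\log m}$ decay faster, at the cost of replacing the usual
$\sqrt{s\log m/n}$ rate by $\sqrt{s\log m/n^{1-\delta}}$; taking $\delta$
close to $0$ recovers the standard rate.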
Theorem~\ref{thm:main} is a consequence of Theorem~1 in \cite{Negahban:2009},
which gives a bound on the convergence rate of regularized estimators. We
restate their theorem in the context of $\ell_1$ regularization in
Lemma~\ref{lem:negahban}.
\begin{lemma}
\label{lem:negahban}
Let ${\cal C}(S) \defeq \{ \Delta \in \mathbb{R}^m\,:\,\|\Delta_{S^c}\|_1 \leq
3 \|\Delta_S\|_1 \}$. Suppose that:
\begin{multline}
\label{eq:rc}
\forall \Delta \in {\cal C}(S), \;
{\cal L}(\theta^* + \Delta) - {\cal L}(\theta^*)\\
- \inprod{\nabla {\cal L}(\theta^*)}{\Delta}
\geq \kappa_{\cal L} \|\Delta\|_2^2 - \tau_{\cal L}^2(\theta^*)
\end{multline}
for some $\kappa_{\cal L} > 0$ and some function $\tau_{\cal L}$. Suppose
moreover that $\lambda \geq 2 \|\nabla {\cal L}(\theta^*)\|_{\infty}$. Then, if
$\hat{\theta}_\lambda$ is the solution of \eqref{eq:pre-mle}:
\begin{displaymath}
\|\hat \theta_\lambda - \theta^* \|_2^2
\leq 9 \frac{\lambda^2 s}{\kappa_{\cal L}^2}
+ \frac{2\lambda}{\kappa_{\cal L}} \tau^2_{\cal L}(\theta^*)
\end{displaymath}
\end{lemma}
To prove Theorem~\ref{thm:main}, we apply Lemma~\ref{lem:negahban} with
$\tau_{\mathcal{L}}=0$. Since $\mathcal{L}$ is twice differentiable and convex,
assumption \eqref{eq:rc} is implied by the restricted eigenvalue condition
\eqref{eq:re}. The upper bound on the $\ell_{\infty}$ norm of
$\nabla\mathcal{L}(\theta^*)$ is given by Lemma~\ref{lem:ub}.
\begin{lemma}
\label{lem:ub}
Assume {\bf(LF)} holds for some $\alpha>0$. For any $\delta\in(0,1)$:
\begin{displaymath}
\|\nabla {\cal L}(\theta^*)\|_{\infty}
\leq 2 \sqrt{\frac{\log m}{\alpha n^{1 - \delta}}}
\quad
\text{w.p.}\; 1-\frac{1}{e^{n^\delta \log m}}
\end{displaymath}
\end{lemma}
\begin{proof}
The gradient of $\mathcal{L}$ is given by:
\begin{multline*}
\nabla \mathcal{L}(\theta^*) =
\frac{1}{|\mathcal{T}|}\sum_{t\in \mathcal{T}}x^t\bigg[
x_i^{t+1}\frac{f'}{f}(\inprod{\theta^*}{x^t})\\
- (1-x_i^{t+1})\frac{f'}{1-f}(\inprod{\theta^*}{x^t})\bigg]
\end{multline*}
Let $\partial_j \mathcal{L}(\theta^*)$ denote the $j$-th coordinate of
$\nabla\mathcal{L}(\theta^*)$ and write
$\partial_j\mathcal{L}(\theta^*)
= \frac{1}{|\mathcal{T}|}\sum_{t\in\mathcal{T}} Y_t$, where $Y_t$ is the $j$-th
coordinate of the $t$-th term of the sum above. Since
$\E[x_i^{t+1}|x^t]= f(\inprod{\theta^*}{x^t})$, we have $\E[Y_t|x^t]=0$. Hence
$Z_t \defeq \sum_{k=1}^t Y_k$ is a martingale with respect to the natural
filtration of the process.
Using assumption (LF), we have almost surely
$|Z_{t+1}-Z_t| = |Y_{t+1}| \leq \frac{1}{\alpha}$
and we can apply Azuma's inequality to $Z_n$:
\begin{displaymath}
\P\big[|Z_n|\geq \lambda\big]\leq
2\exp\left(\frac{-\lambda^2\alpha}{2n}\right)
\end{displaymath}
Since $\partial_j\mathcal{L}(\theta^*) = Z_n/n$, applying the previous
inequality with $\lambda$ replaced by $n\lambda$ and taking a union bound over
the $m$ coordinates of $\nabla\mathcal{L}(\theta^*)$ implies:
\begin{align*}
\P\big[\|\nabla\mathcal{L}(\theta^*)\|_{\infty}\geq \lambda \big]
&\leq 2m\exp\left(\frac{-\lambda^2n\alpha}{2}\right)
\end{align*}
Choosing $\lambda\defeq 2\sqrt{\frac{\log m}{\alpha n^{1-\delta}}}$ makes the
right-hand side equal to $2m\,e^{-2n^\delta\log m}$, which is at most
$e^{-n^\delta\log m}$ as soon as $n^\delta\log m\geq\log(2m)$. This concludes
the proof.
\end{proof}
\begin{corollary}
\label{cor:variable_selection}
Assume that {\bf (RE)} holds with $\gamma > 0$ and that $\theta^*$ is $s$-sparse. For $\eta > 0$, consider the set $\hat {\cal S}_\eta \defeq \{ j \in \{1,\ldots,m\} : \hat \theta_j > \eta\}$ and, for $0 < \epsilon < \eta$, let ${\cal S}^*_{\eta + \epsilon} \defeq \{ j \in \{1,\ldots,m\} : \theta^*_j > \eta +\epsilon \}$ be the set of all true `strong' parents. Suppose the number of observations satisfies:
\begin{equation}
n > \frac{36}{p_{\min}\gamma^2 \epsilon^2} s \log m
\end{equation}
Then, with probability $1-\frac{1}{m}$, ${\cal S}^*_{\eta + \epsilon} \subset
\hat {\cal S}_\eta \subset {\cal S}^*$, where ${\cal S}^*\defeq\text{supp}(\theta^*)$.
In other words, we recover all `strong' parents and no `false' parents.
\end{corollary}
\begin{proof}
Suppose $\theta^*$ is exactly $s$-sparse. By choosing $\delta = 0$ in Theorem~\ref{thm:main}, if $n>\frac{36}{p_{\min}\gamma^2 \epsilon^2} s \log m$, then $\|\hat \theta-\theta^*\|_2 < \epsilon < \eta$ with probability $1-\frac{1}{m}$. If $\theta^*_j = 0$ and $\hat \theta_j > \eta$, then $\|\hat \theta - \theta^*\|_2 \geq |\hat \theta_j-\theta^*_j| > \eta > \epsilon$, which is a contradiction: we recover no `false' parents. Conversely, if $\theta^*_j > \eta + \epsilon$, then $\hat \theta_j \geq \theta^*_j - |\hat\theta_j - \theta^*_j| > (\eta+\epsilon) - \epsilon = \eta$, so $j \in \hat{\cal S}_\eta$: we recover all `strong' parents.
\end{proof}
Note that if we want \emph{perfect recovery} of all edges, we must estimate $p_{\min}$ to within some $\epsilon' < p_{\min}$ and set $\eta \defeq p_{\min} - \epsilon'$.
\subsection{Relaxing the Sparsity Constraint}
\label{sec:relaxing_sparsity}
In practice, exact sparsity is rarely verified. For social networks in particular, it is more realistic to assume that each node has few strong `parents' and many `weak' parents. Rather than obtaining an impossibility result in this situation, we show that we only pay a small price for relaxing the sparsity constraint. Let $\theta^*_{\lfloor s \rfloor}$ be the best $s$-sparse approximation to $\theta^*$, defined as
$$\theta^*_{\lfloor s \rfloor} \defeq \arg\min_{\|\theta\|_0 \leq s} \|\theta - \theta^*\|_1$$
Then the price we pay for recovering the weights of non-exactly sparse vectors is proportional to $\|\theta^* - \theta^*_{\lfloor s \rfloor}\|_1$, i.e. the sum of the absolute values of the coefficients of $\theta^*$ other than the $s$ largest. In other words, the closer $\theta^*$ is to being sparse, the smaller the price. These results are formalized in the following theorem, which is also a consequence of Theorem~1 in \cite{Negahban:2009} and Lemma~\ref{lem:icc_lambda_upper_bound}.
\begin{theorem}
\label{thm:approx_sparse}
Let $\theta^*_{\lfloor s \rfloor}$ be the best $s$-sparse approximation to the true vector $\theta^*$ and let $S$ denote its support. Suppose the {\bf(RE)} assumption holds with constant $\gamma > 0$ for the Hessian $\nabla^2 \mathcal{L}(\theta^*)$ over the following set:
\begin{align}
\nonumber
{\cal C}' \defeq & \{X \in \mathbb{R}^m : \|X_{S^c}\|_1 \leq 3 \|X_S\|_1 + 4 \|\theta^* - \theta^*_{\lfloor s \rfloor}\|_1 \} \\ \nonumber
& \cap \{ \|X\|_1 \leq 1 \}
\end{align}
By solving \eqref{eq:pre-mle} with $\lambda \defeq 2\sqrt{\frac{\log m}{n^{1 - \delta} p_{\min}}}$ we have:
\begin{align}
\|\hat \theta - \theta^* \|_2 \leq 3 \frac{\sqrt{s}\,\lambda}{\gamma} + 2 \sqrt{\frac{\lambda}{\gamma} \|\theta^* - \theta^*_{\lfloor s \rfloor}\|_1}
\end{align}
\end{theorem}
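To interpret this bound, consider a node with $s$ `strong' parents and many `weak' parents whose weights have total $\ell_1$ norm $\rho$: then $\|\theta^* - \theta^*_{\lfloor s \rfloor}\|_1 \leq \rho$, and the additional error term $2\sqrt{\lambda\rho/\gamma}$ vanishes as $\rho\to 0$, so that Theorem~\ref{thm:approx_sparse} recovers the guarantee of Theorem~\ref{thm:main} for exactly sparse vectors, up to constants.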
As before, edge recovery is a consequence of the upper bound on $\|\hat \theta - \theta^*\|_2$.
\begin{corollary}
If $\theta^*$ is not exactly $s$-sparse and the number of observations satisfies:
\begin{equation}
n > \frac{36 \, \| \theta^* - \theta^*_{\lfloor s\rfloor}\|_1}{p_{\min}\gamma^2 \epsilon^4} \, s \log m
\end{equation}
then similarly ${\cal S}^*_{\eta + \epsilon} \subset \hat {\cal S}_\eta \subset {\cal S}^*$ w.h.p.
\end{corollary}
\subsection{The Independent Cascade Model}
Recall that to cast the Independent Cascade model as a Generalized Linear Cascade, we performed the following change of variables: $\forall i,j \ \Theta_{i,j} = \log(1 - p_{i,j})$. The previous results hold \emph{a priori} for the ``effective'' parameter $\Theta$. The following lemma shows they also hold for the original infection probabilities $p_{i,j}$:
\begin{lemma}
\label{lem:theta_p_upperbound}
For any $\theta, \theta' \in (-\infty,0]^m$ with corresponding infection probabilities $p, p'$ given by $p_j = 1 - e^{\theta_j}$, we have $\|\theta - \theta' \|_2 \geq \|p - p'\|_2$. Moreover, $\|\theta^* - \theta^*_{\lfloor s \rfloor}\|_1 \geq \| p^* - p^*_{\lfloor s \rfloor} \|_1$.
\end{lemma}
\begin{proof}
Using the inequality $\forall x>0, \; \log x \geq 1 - \frac{1}{x}$, we have $|\log (1 - p) - \log (1-p')| \geq \max\left(1 - \frac{1-p}{1-p'}, 1 - \frac{1-p'}{1-p}\right) \geq \max( p-p', p'-p) = |p - p'|$. Applying this inequality coordinate-wise yields the first result. The second result follows from the monotonicity of $\log$ and the inequality $\forall x \in (0,1), \; | \log (1 -x) | \geq x$.
\end{proof}
The results of Sections~\ref{sec:main_theorem} and~\ref{sec:relaxing_sparsity} therefore also hold for the original transition probabilities $p$.
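For instance, combining Lemma~\ref{lem:theta_p_upperbound} with
Theorem~\ref{thm:main}, and assuming the estimate $\hat\theta$ is
coordinate-wise non-positive so that $\hat p_j \defeq 1 - e^{\hat\theta_j}$ is
a valid probability, the estimated infection probabilities satisfy:
\begin{displaymath}
\|\hat p - p^* \|_2
\leq \frac{3}{\gamma} \sqrt{\frac{s \log m}{\alpha n^{1-\delta}}}
\quad
\text{w.p.}\;1-\frac{1}{e^{n^\delta \log m}}
\end{displaymath}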
\begin{comment}
\begin{lemma}
Let ${\cal C}({\cal M}, \bar {\cal M}^\perp, \theta^*) \defeq \{ \Delta \in \mathbb{R}^p | {\cal R}(\Delta_{\bar {\cal M}^\perp} \leq 3 {\cal R}(\Delta_{\bar {\cal M}} + 4 {\cal R}(\theta^*_{{\cal M}^\perp}) \}$, where $\cal R$ is a \emph{decomposable} regularizer with respect to $({\cal M}, \bar {\cal M}^\perp)$, and $({\cal M}, \bar {\cal M})$ are two subspaces such that ${\cal M} \subseteq \bar {\cal M}$. Suppose that $\exists \kappa_{\cal L} > 0, \; \exists \tau_{\cal L}, \; \forall \Delta \in {\cal C}, \; {\cal L}(\theta^* + \Delta) - {\cal L}(\theta^*) - \langle \Delta {\cal L}(\theta^*), \Delta \rangle \geq \kappa_{\cal L} \|\Delta\|^2 - \tau_{\cal L}^2(\theta^*)$. Let $\Psi({\cal M}) \defeq \sup_{u \in {\cal M} \backslash \{0\}} \frac{{\cal R}(u)}{\|u\|}$. Finally suppose that $\lambda \geq 2 {\cal R}(\nabla {\cal L}(\theta^*))$, where ${\cal R}^*$ is the conjugate of ${\cal R}$. Then: $$\|\hat \theta_\lambda - \theta^* \|^2 \leq 9 \frac{\lambda^2}{\kappa_{\cal L}}\Psi^2(\bar {\cal M}) + \frac{\lambda}{\kappa_{\cal L}}\{2 \tau^2_{\cal L}(\theta^*) + 4 {\cal R}(\theta^*_{{\cal M}^\perp}\}$$
\end{lemma}
\end{comment}