In this section, we apply the sparse recovery framework to analyze under which
assumptions our program~\eqref{eq:pre-mle} recovers the true parameter
$\theta_i$ of the cascade model. Furthermore, if we can estimate $\theta_i$ with
sufficient accuracy, the support of $\theta_i$ can then be recovered by simple
thresholding, which provides a solution to the standard Graph Inference problem.

We will first give results in the exactly sparse setting in which $\theta_i$
has a support of size exactly $s$. We will then relax this sparsity constraint
and give results in the \emph{stable recovery} setting where $\theta_i$ is
approximately $s$-sparse.

As mentioned in Section~\ref{sec:mle}, the maximum likelihood estimation
program is decomposable. We will henceforth focus on a single node $i\in V$ and
omit the subscript $i$ from the notation when there is no ambiguity. The
recovery problem is now that of estimating a single vector $\theta^*$ from
a set $\mathcal{T}$ of observations. We will write $n\defeq |\mathcal{T}|$.
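
To make this estimation step concrete, the following Python sketch implements
one plausible instantiation of the per-node program: an $\ell_1$-regularized
Bernoulli negative log-likelihood with a generic inverse link $f$, minimized by
proximal gradient descent (ISTA). It is only an illustration of the approach,
not the exact program \eqref{eq:pre-mle} of Section~\ref{sec:mle}; the function
names (\texttt{l1\_mle}, \texttt{soft\_threshold}), the data layout, and the
fixed step size are choices made for the example.
\begin{verbatim}
import numpy as np

def soft_threshold(v, t):
    # Proximal operator of t * ||.||_1.
    return np.sign(v) * np.maximum(np.abs(v) - t, 0.0)

def l1_mle(X, y, f, fprime, lam, step=0.05, iters=5000):
    # Sketch of the per-node l1-regularized MLE, solved by ISTA.
    # X: (n, m) array, row t = observed state x^t
    # y: (n,) array, y[t] = x_i^{t+1} (whether node i becomes infected)
    # f, fprime: inverse link function and its derivative
    n, m = X.shape
    theta = np.zeros(m)
    for _ in range(iters):
        z = X @ theta
        p = np.clip(f(z), 1e-12, 1.0 - 1e-12)
        # Gradient of the negative Bernoulli log-likelihood.
        grad = -(X.T @ (fprime(z) * (y / p - (1.0 - y) / (1.0 - p)))) / n
        theta = soft_threshold(theta - step * grad, step * lam)
    return theta

# Example usage for the voter model, where f is the identity:
# theta_hat = l1_mle(X, y, f=lambda z: z,
#                    fprime=lambda z: np.ones_like(z), lam=0.1)
\end{verbatim}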

\subsection{Main Theorem}
\label{sec:main_theorem}

In this section, we analyze the case where $\theta^*$ is exactly sparse. We
write $S\defeq\text{supp}(\theta^*)$ and $s=|S|$. Our main theorem will rely
on the now standard \emph{restricted eigenvalue condition}.

\begin{definition}
    Let $\Sigma\in\mathcal{S}_m(\reals)$ be a real symmetric matrix and let $S$ be
    a subset of $\{1,\ldots,m\}$. Define $\mathcal{C}(S)\defeq
    \{X\in\reals^m\,:\,\|X\|_1\leq 1\text{ and } \|X_{S^c}\|_1\leq
    3\|X_S\|_1\}$. We say that $\Sigma$ satisfies the
    $(S,\gamma)$-\emph{restricted eigenvalue condition} iff:
\begin{equation}
    \forall X \in \mathcal{C}(S), \quad \| \Sigma X \|_2^2 \geq \gamma \|X\|_2^2
\tag{RE}
\label{eq:re}
\end{equation}
\end{definition}
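
For intuition, the identity matrix $\Sigma = I_m$ satisfies the
$(S,\gamma)$-restricted eigenvalue condition with $\gamma = 1$ for every $S$,
since $\|\Sigma X\|_2^2 = \|X\|_2^2$. The condition only has bite when $\Sigma$
is ill-conditioned or rank-deficient, in which case the inequality is required
to hold only on the cone $\mathcal{C}(S)$ rather than on all of $\reals^m$.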

A discussion of this assumption in the context of generalized linear cascade
models can be found in Section~\ref{sec:re}.

We will also need the following assumption on the inverse link function $f$ of
the generalized linear cascade model:
\begin{equation}
    \tag{LF}
    \max\left(\left|\frac{f'}{f}(\inprod{\theta^*}{x})\right|,
    \left|\frac{f'}{1-f}(\inprod{\theta^*}{x})\right|\right)\leq
    \frac{1}{\alpha}
\end{equation}
for some $\alpha\in(0, 1)$ and for all $x\in\reals^m$ such that
$f(\inprod{\theta^*}{x})\notin\{0,1\}$.

In the voter model, $\frac{f'}{f}(z) = \frac{1}{z}$ and $\frac{f'}{1-f}(z)
= \frac{1}{1-z}$. Hence (LF) holds as soon as $\alpha\leq \Theta_{i,j}\leq
1-\alpha$ for all $(i,j)\in E$. Similarly, in the Independent Cascade Model,
$\frac{f'}{f}(z) = \frac{1}{1-e^z}$ and $\frac{f'}{1-f}(z) = -1$, and (LF) holds
if $p_{i, j}\leq 1-\alpha$ for all $(i, j)\in E$. Recall that in this case
$\Theta_{i,j} = \log(1-p_{i,j})$.

\begin{theorem}
\label{thm:main}
Assume the Hessian $\nabla^2\mathcal{L}(\theta^*)$ satisfies the
$(S,\gamma)$-{\bf(RE)} for some $\gamma > 0$ and that {\bf (LF)} holds for
some $\alpha > 0$. For any $\delta\in(0,1)$, let $\hat{\theta}$ be the solution
of \eqref{eq:pre-mle} with $\lambda \defeq 2\sqrt{\frac{\log m}{\alpha
n^{1-\delta}}}$; then:
\begin{equation}
    \label{eq:sparse}
    \|\hat \theta - \theta^* \|_2
    \leq \frac{3}{\gamma} \sqrt{\frac{s \log m}{\alpha n^{1-\delta}}}
\quad
\text{w.p.}\;1-\frac{1}{e^{n^\delta \log m}}
\end{equation}
\end{theorem}

Before moving to the proof of Theorem~\ref{thm:main}, note that interpreting it
in the case of the Independent Cascade Model requires one more step. Indeed, to
cast it as a generalized linear cascade model, we had to perform the
transformation $\Theta_{i,j} = \log(1-p_{i,j})$, where $p_{i,j}$ are the
infection probabilities. Fortunately, if we estimate $p_j$ through
$\hat{p}_{j} = 1 - e^{\hat{\theta}_j}$, a bound on $\|\hat\theta - \theta^*\|_2$
directly implies a bound on $\|\hat p - p^*\|_2$:

\begin{lemma}
    $\|\hat{\theta} - \theta^* \|_2 \geq \|\hat{p} - p^*\|_2$.
\end{lemma}
\begin{proof}
Using the inequality $\log x \geq 1 - \frac{1}{x}$ for all $x>0$, we have for
any $p, p'\in[0,1)$: $|\log (1 - p) - \log (1-p')| \geq \max\big(1
- \frac{1-p}{1-p'}, 1 - \frac{1-p'}{1-p}\big) \geq |p-p'|$. Applying this
coordinate-wise with $p = \hat{p}_j$ and $p' = p^*_j$, and using
$\log(1-\hat{p}_j) = \hat{\theta}_j$ and $\log(1-p^*_j) = \theta^*_j$, yields
$|\hat\theta_j - \theta^*_j| \geq |\hat{p}_j - p^*_j|$ for every $j$; summing
the squares over $j$ proves the claim.
\end{proof}
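
In practice, the corresponding back-transformation is immediate to apply to the
estimate. A minimal Python sketch, assuming the change of variables above (the
helper name is illustrative):
\begin{verbatim}
import numpy as np

def theta_to_p(theta_hat):
    # Invert Theta_{i,j} = log(1 - p_{i,j}); entries of theta_hat are <= 0.
    return 1.0 - np.exp(theta_hat)
\end{verbatim}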

Theorem~\ref{thm:main} is a consequence of Theorem~1 in \cite{Negahban:2009},
which gives a bound on the convergence rate of regularized estimators. We state
their theorem in the context of $\ell_1$ regularization in
Lemma~\ref{lem:negahban}.

\begin{lemma}
    \label{lem:negahban}
Let ${\cal C}(S) \defeq \{ \Delta \in \mathbb{R}^m\,|\,\|\Delta_{S^c}\|_1 \leq
3 \|\Delta_{S}\|_1 \}$. Suppose that:
\begin{multline}
    \label{eq:rc}
    \forall \Delta \in {\cal C}(S), \;
    {\cal L}(\theta^* + \Delta) - {\cal L}(\theta^*)\\
    - \inprod{\nabla {\cal L}(\theta^*)}{\Delta}
    \geq \kappa_{\cal L} \|\Delta\|_2^2 - \tau_{\cal L}^2(\theta^*)
\end{multline}
for some $\kappa_{\cal L} > 0$ and some function $\tau_{\cal L}$. Finally,
suppose that $\lambda \geq 2 \|\nabla {\cal L}(\theta^*)\|_{\infty}$; then, if
$\hat{\theta}_\lambda$ is the solution of \eqref{eq:pre-mle}:
\begin{displaymath}
\|\hat \theta_\lambda - \theta^* \|_2^2
\leq 9 \frac{\lambda^2 s}{\kappa_{\cal L}}
+ \frac{\lambda}{\kappa_{\cal L}^2} 2 \tau^2_{\cal L}(\theta^*)
\end{displaymath}
\end{lemma}

To prove Theorem~\ref{thm:main}, we apply Lemma~\ref{lem:negahban} with
$\tau_{\mathcal{L}}=0$. Since $\mathcal{L}$ is twice differentiable and convex,
assumption \eqref{eq:rc} is implied by the restricted eigenvalue condition
\eqref{eq:re}. The upper bound on the $\ell_{\infty}$ norm of
$\nabla\mathcal{L}(\theta^*)$ is given by Lemma~\ref{lem:ub}.

\begin{lemma}
\label{lem:ub}
Assume {\bf(LF)} holds for some $\alpha>0$. For any $\delta\in(0,1)$:
\begin{displaymath}
    \|\nabla {\cal L}(\theta^*)\|_{\infty}
    \leq 2 \sqrt{\frac{\log m}{\alpha n^{1 - \delta}}}
    \quad
    \text{w.p.}\; 1-\frac{1}{e^{n^\delta \log m}}
\end{displaymath}
\end{lemma}

\begin{proof}
The gradient of $\mathcal{L}$ is given by:
\begin{multline*}
    \nabla \mathcal{L}(\theta^*) =
    \frac{1}{|\mathcal{T}|}\sum_{t\in \mathcal{T}}x^t\bigg[
        x_i^{t+1}\frac{f'}{f}(\inprod{\theta^*}{x^t})\\
    - (1-x_i^{t+1})\frac{f'}{1-f}(\inprod{\theta^*}{x^t})\bigg]
\end{multline*}

Let $\partial_j \mathcal{L}(\theta^*)$ be the $j$-th coordinate of
$\nabla\mathcal{L}(\theta^*)$ and write $\partial_j\mathcal{L}(\theta^*)
= \frac{1}{|\mathcal{T}|}\sum_{t\in\mathcal{T}} Y_t$, where $Y_t$ denotes the
$j$-th coordinate of the corresponding summand above. Since
$\E[x_i^{t+1}|x^t]= f(\inprod{\theta^*}{x^t})$, we have $\E[Y_t|x^t] = 0$.
Hence $Z_t \defeq \sum_{k=1}^t Y_k$ is a martingale with respect to the natural
filtration of the process.

Using assumption (LF), we have almost surely $|Z_{t+1}-Z_t|\leq \frac{1}{\alpha}$
and we can apply Azuma's inequality to the martingale $(Z_t)$:
\begin{displaymath}
    \P\big[|Z_{n}|\geq \lambda\big]\leq
    2\exp\left(\frac{-\lambda^2\alpha}{2n}\right)
\end{displaymath}

Since $\partial_j\mathcal{L}(\theta^*) = Z_n/n$, applying the previous
inequality with threshold $n\lambda$ together with a union bound over the $m$
coordinates of $\nabla\mathcal{L}(\theta^*)$ implies:
\begin{align*}
    \P\big[\|\nabla\mathcal{L}(\theta^*)\|_{\infty}\geq \lambda \big]
    &\leq 2m\exp\left(\frac{-\lambda^2n\alpha}{2}\right)
\end{align*}
Choosing $\lambda\defeq 2\sqrt{\frac{\log m}{\alpha n^{1-\delta}}}$ makes the
right-hand side equal to $2m\exp(-2n^{\delta}\log m) = 2m^{1-2n^{\delta}}$,
which yields the claimed probability bound and concludes the proof.
\end{proof}

We now show how to use Theorem~\ref{thm:main} to recover the support of
$\theta^*$, that is, to solve the Graph Inference problem.

\begin{corollary}
\label{cor:variable_selection}
Under the same assumptions as in Theorem~\ref{thm:main}, let $\hat {\cal S}_\eta
\defeq \{ j \in \{1,\ldots, m\} : \hat{\theta}_j > \eta\}$ for $\eta > 0$. For
$0 < \epsilon < \eta$, let ${\cal S}^*_{\eta + \epsilon} \defeq \{
j \in \{1,\ldots,m\} :\theta^*_j > \eta +\epsilon \}$ be the set of all true
`strong' parents. Suppose the number of measurements verifies:
$ n > \frac{9}{\alpha\gamma^2\epsilon^2} s \log m $.
Then with probability $1-\frac{1}{m}$, ${\cal S}^*_{\eta + \epsilon} \subseteq
\hat {\cal S}_\eta \subseteq {\cal S}^*$, where ${\cal S}^*
\defeq \text{supp}(\theta^*)$. In other words, we recover all `strong' parents
and no `false' parents.
\end{corollary}

\begin{proof}
By choosing $\delta = 0$, if $n>\frac{9}{\alpha\gamma^2\epsilon^2} s \log m$,
then $\|\hat \theta-\theta^*\|_2 < \epsilon < \eta$ with probability
$1-\frac{1}{m}$. If $\theta^*_j = 0$ and $\hat \theta_j > \eta$, then $\|\hat
\theta - \theta^*\|_2 \geq |\hat \theta_j-\theta^*_j| > \eta$, which is
a contradiction. Therefore we get no false positives. If $\theta^*_j > \eta
+ \epsilon$, then $|\hat{\theta}_j- \theta^*_j| < \epsilon \implies
\hat\theta_j > \eta$. Therefore, we recover all strong parents.
\end{proof}

Assuming we know a lower bound $\alpha$ on the nonzero entries $\Theta_{i,j}$,
Corollary~\ref{cor:variable_selection} can be applied to the Graph Inference
problem in the following manner: pick $\eta = \frac{\alpha}{3}$ and $\epsilon
= \frac{\eta}{2}$; then $S_{\eta+\epsilon}^* = S$ provided that
$n=\Omega\left(\frac{s\log m}{\gamma^2\alpha^3}\right)$. That is, the support
of $\theta^*$ can be found by thresholding $\hat{\theta}$ at the level $\eta$.
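
As a concrete illustration of this thresholding step, here is a minimal Python
sketch, continuing the estimation sketch given above (the function name is
again illustrative):
\begin{verbatim}
import numpy as np

def recover_support(theta_hat, alpha):
    # Threshold the estimate at eta = alpha / 3, as discussed above.
    return np.flatnonzero(theta_hat > alpha / 3.0)
\end{verbatim}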

\subsection{Relaxing the Sparsity Constraint}
\label{sec:relaxing_sparsity}

In practice, exact sparsity is rarely verified. For social networks in particular, it is more realistic to assume that each node has few `strong' parents and many `weak' parents. Rather than obtaining an impossibility result in this situation, we show that we pay only a small price for relaxing the sparsity constraint. Let $\theta^*_{\lfloor s \rfloor}$ be the best $s$-sparse approximation to $\theta^*$, defined as
$$\theta^*_{\lfloor s \rfloor} \defeq \operatorname*{arg\,min}_{\|\theta\|_0 \leq s} \|\theta - \theta^*\|_1$$
Then the price we pay for recovering the weights of non-exactly sparse vectors is proportional to $\|\theta^* - \theta^*_{\lfloor s \rfloor}\|_1$, i.e.\ the sum of the coefficients of $\theta^*$ other than the $s$ largest. In other words, the closer $\theta^*$ is to being sparse, the smaller the price. These results are formalized in the following theorem, which is also a consequence of Theorem~1 in \cite{Negahban:2009} and Lemma~\ref{lem:icc_lambda_upper_bound}.
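
For example, if $\theta^* = (0.5, 0.3, 0.1, 0.05, 0)$ and $s = 2$, then
$\theta^*_{\lfloor s \rfloor} = (0.5, 0.3, 0, 0, 0)$ and the price is governed
by $\|\theta^* - \theta^*_{\lfloor s \rfloor}\|_1 = 0.15$.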

\begin{theorem}
\label{thm:approx_sparse}
Let $\theta^*_{\lfloor s \rfloor}$ be the best $s$-sparse approximation to the true vector $\theta^*$. Suppose the {\bf(RE)} assumption holds for the Hessian $\nabla^2 {\cal L}(\theta^*)$ and for the following set:
\begin{align}
\nonumber
{\cal C}' \defeq & \{X \in \mathbb{R}^m : \|X_{S^c}\|_1 \leq 3 \|X_S\|_1 + 4 \|\theta^* - \theta^*_{\lfloor s \rfloor}\|_1 \} \\ \nonumber
& \cap \{ \|X\|_1 \leq 1 \}
\end{align}
By solving \eqref{eq:pre-mle} for $\lambda \defeq 2\sqrt{\frac{\log m}{n^{1 - \delta} p_{\min}}}$ we have:
\begin{align}
\|\hat p - p^* \|_2 \leq 3 \frac{\sqrt{s}\,\lambda}{\gamma} + 2 \sqrt{\frac{\lambda \|p^* - p^*_{\lfloor s \rfloor}\|_1}{\gamma}}
\end{align}
\end{theorem}

As before, edge recovery is a consequence of upper-bounding $\|\theta^* - \hat \theta\|_2$.

\begin{corollary}
If $\theta^*$ is not exactly $s$-sparse and the number of measurements verifies:
\begin{equation}
n > \frac{36 \| p^* - p^*_{\lfloor s\rfloor}\|_1}{p_{\min}\gamma^2 \epsilon^4} s \log m
\end{equation}
then similarly ${\cal S}^*_{\eta + \epsilon} \subseteq \hat {\cal S}_\eta \subseteq {\cal S}^*$ w.h.p.
\end{corollary}



\subsection{Restricted Eigenvalue Condition}
\label{sec:re}

In this section, we discuss the main assumption of Theorem~\ref{thm:main},
namely the restricted eigenvalue condition. We then compare it to the
irrepresentability condition considered in \cite{Daneshmand:2014}.

\paragraph{Interpretation}

The restricted eigenvalue condition, introduced in \cite{bickel:2009}, is one
of the weakest sufficient conditions on the design matrix for successful sparse
recovery \cite{vandegeer:2009}. Several recent papers show that large classes
of correlated designs obey the restricted eigenvalue property with high
probability \cite{raskutti:10,rudelson:13}.


\paragraph{(RE) with high probability}

Expressing the minimum restricted eigenvalue $\gamma$ as a function of the
cascade model parameters is highly non-trivial. The restricted eigenvalue
property is, however, well behaved in the following sense: under reasonable
assumptions, if the population Hessian $\mathbb{E} \left[\nabla^2
{\cal L}(\theta) \right]$, corresponding to the \emph{Fisher Information
Matrix} of the Cascade Model as a function of $\Theta$, verifies the restricted
eigenvalue property, then the finite-sample Hessian also verifies the
restricted eigenvalue property with overwhelming probability. It is
straightforward to show that this holds when $n \geq C s^2 \log m$
\cite{vandegeer:2009}, where $C$ is an absolute constant. By adapting Theorem~8
of \cite{rudelson:13}\footnote{This adaptation remains to be verified.}, this
can be reduced to:
\begin{displaymath}
n \geq C s \log m \log^3 \left( \frac{s \log m}{C'} \right)
\end{displaymath}
where $C, C'$ are constants not depending on $(s, m, n)$.

\paragraph{(RE) vs Irrepresentability Condition}

\cite{Daneshmand:2014} rely on an `incoherence' condition on the Hessian of the likelihood function. It is in fact easy to see that their condition is equivalent to what is more commonly called the \emph{$(S,s)$-irrepresentability} condition:

\begin{definition}
Following similar notation, let $Q^* \defeq \nabla^2 {\cal L}(\theta^*)$. Let $S$ and $S^c$ be the sets of indices of the parents and non-parents respectively, and let $Q_{S,S}$, $Q_{S^c,S}$, $Q_{S, S^c}$, and $Q_{S^c, S^c}$ denote the induced sub-matrices. Consider the following constant:
\begin{equation}
\nu_{\text{irrepresentable}}(S) \defeq \max_{\tau \in \mathbb{R}^s \ :\ \| \tau \|_{\infty} \leq 1} \|Q_{S^c, S} Q_{S, S}^{-1} \tau\|_{\infty}
\end{equation}
The $(S,s)$-irrepresentability condition holds if $\nu_{\text{irrepresentable}}(S) < 1 - \epsilon$ for some $\epsilon > 0$.
\end{definition}

This condition has been shown to be essentially necessary for variable selection \cite{Zhao:2006}. Several recent papers \cite{vandegeer:2011,Zou:2006} argue that this condition is unrealistic when the variables are correlated. Consider the following simplified example from \cite{vandegeer:2011}:
\begin{equation}
\nonumber
\left(
\begin{array}{cc}
I_s & \rho J \\
\rho J & I_s \\
\end{array}
\right)
\end{equation}
where $I_s$ is the $s \times s$ identity matrix, $J$ is the $s \times s$ all-ones matrix and $\rho \in \mathbb{R}^+$. It is easy to see that $\nu_{\text{irrepresentable}}(S) = \rho s$ and $\lambda_{\min}(Q) \geq 1 - \rho$, so that for any $\frac{1}{s} < \rho < 1$, the restricted eigenvalue condition holds trivially but the $(S,s)$-irrepresentability condition does not.

It is intuitive that the irrepresentability condition is stronger than the {\bf(RE)} assumption. In fact, a slightly modified result from \cite{vandegeer:2009} shows that a `strong' irrepresentability condition directly {\it implies} the {\bf(RE)} condition for $\ell_2$-recovery:

\begin{proposition}
\label{prop:irrepresentability}
If the irrepresentability condition holds with $\epsilon > \frac{2}{3}$, then the restricted eigenvalue condition holds with constant  $\gamma_n \geq \frac{ (1 - 3(1 -\epsilon))^2 \lambda_{\min}^2}{4s}n$, where $\lambda_{\min} > 0$ is the smallest eigenvalue of $Q^*_{S,S}$, on which the results of \cite{Daneshmand:2014} also depend.
\end{proposition}

\begin{comment}
\begin{lemma}
Let ${\cal C}({\cal M}, \bar {\cal M}^\perp, \theta^*) \defeq \{ \Delta \in \mathbb{R}^p | {\cal R}(\Delta_{\bar {\cal M}^\perp} \leq 3 {\cal R}(\Delta_{\bar {\cal M}} + 4 {\cal R}(\theta^*_{{\cal M}^\perp}) \}$, where $\cal R$ is a \emph{decomposable} regularizer with respect to $({\cal M}, \bar {\cal M}^\perp)$, and $({\cal M}, \bar {\cal M})$ are two subspaces such that  ${\cal M} \subseteq \bar {\cal M}$. Suppose that $\exists \kappa_{\cal L} > 0, \; \exists \tau_{\cal L}, \; \forall \Delta \in {\cal C}, \; {\cal L}(\theta^* + \Delta) - {\cal L}(\theta^*) - \langle \Delta {\cal L}(\theta^*), \Delta \rangle \geq \kappa_{\cal L} \|\Delta\|^2 - \tau_{\cal L}^2(\theta^*)$. Let $\Psi({\cal M}) \defeq \sup_{u \in {\cal M} \backslash \{0\}} \frac{{\cal R}(u)}{\|u\|}$. Finally suppose that $\lambda \geq 2 {\cal R}(\nabla {\cal L}(\theta^*))$, where ${\cal R}^*$ is the conjugate of ${\cal R}$. Then: $$\|\hat \theta_\lambda - \theta^* \|^2 \leq 9 \frac{\lambda^2}{\kappa_{\cal L}}\Psi^2(\bar {\cal M}) + \frac{\lambda}{\kappa_{\cal L}}\{2 \tau^2_{\cal L}(\theta^*) + 4 {\cal R}(\theta^*_{{\cal M}^\perp}\}$$
\end{lemma}

\subsection{The Independent Cascade Model}

Recall that to cast the Independent Cascade model as a Generalized Linear
Cascade, we performed the following change of variables: $\forall i,j
\ \Theta_{i,j} = \log(1 - p_{i,j})$. The previous results hold \emph{a priori}
for the ``effective'' parameter $\Theta$. The following lemma shows they also
hold for the original infection probabilities $p_{i,j}$:

The results of sections~\ref{sec:main_theorem} and \ref{sec:relaxing_sparsity}
therefore hold for the original transition probabilities $p$.
\end{comment}