In this section, we apply the sparse recovery framework to determine under
which assumptions our program~\eqref{eq:pre-mle} recovers the true parameter
$\theta_i$ of the cascade model. Furthermore, if we can estimate $\theta_i$
with sufficient accuracy, the support of $\theta_i$ can then be recovered by
simple thresholding, which provides a solution to the standard Network
Inference problem.

We will first give results in the exactly sparse setting in which $\theta_i$
has a support of size exactly $s$. We will then relax this sparsity constraint
and give results in the \emph{stable recovery} setting where $\theta_i$ is
approximately $s$-sparse.

As mentioned in Section~\ref{sec:mle}, the maximum likelihood estimation
program is decomposable. We will henceforth focus on a single node $i\in V$ and
omit the subscript $i$ from the notation when there is no ambiguity. The
recovery problem is now that of estimating a single vector $\theta^*$ from
a set $\mathcal{T}$ of observations. We will write $n\defeq |\mathcal{T}|$.
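
For concreteness, the following is a minimal numerical sketch of how
\eqref{eq:pre-mle} could be solved for a single node by proximal gradient
descent. It assumes, purely for illustration, a logistic activation function
$f(z) = (1+e^{-z})^{-1}$; the names \texttt{X} (the matrix stacking the
observed activation vectors $x^t$), \texttt{y} (the node's subsequent states
$x_i^{t+1}$), and \texttt{lam} (the regularization parameter $\lambda$) are
ours and not part of the model.

\begin{verbatim}
import numpy as np

def soft_threshold(v, t):
    # Proximal operator of t * ||.||_1
    return np.sign(v) * np.maximum(np.abs(v) - t, 0.0)

def l1_regularized_mle(X, y, lam, step=0.1, n_iter=2000):
    # ISTA sketch for the l1-regularized MLE of one node, assuming
    # a logistic link f(z) = 1 / (1 + exp(-z)).
    # X: (n, m) binary observations x^t; y: (n,) states x_i^{t+1}.
    n, m = X.shape
    theta = np.zeros(m)
    for _ in range(n_iter):
        p = 1.0 / (1.0 + np.exp(-X @ theta))   # f(<theta, x^t>)
        grad = X.T @ (p - y) / n               # gradient of the empirical
                                               # negative log-likelihood
        theta = soft_threshold(theta - step * grad, step * lam)
    return theta
\end{verbatim}

Any convex solver would do equally well; proximal gradient is used here only
because the $\ell_1$ proximal operator is the explicit soft-thresholding map.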

\subsection{Main Theorem}
\label{sec:main_theorem}

In this section, we analyze the case where $\theta^*$ is exactly sparse. We
write $S\defeq\text{supp}(\theta^*)$ and $s\defeq|S|$. Recall that $\theta_i$ is the
vector of weights of all edges \emph{directed at} the node we are solving for.
In other words, $S$ is the set of all nodes that can influence node $i$,
also referred to as its parents. Our main theorem relies on the now standard
\emph{restricted eigenvalue condition} introduced
by~\cite{bickel2009simultaneous}.

\begin{definition}
    Let $\Sigma\in\mathcal{S}_m(\reals)$ be a real symmetric matrix and $S$ be
    a subset of $\{1,\ldots,m\}$. Defining $\mathcal{C}(S)\defeq
    \{X\in\reals^m\,:\,\|X\|_1\leq 1\text{ and } \|X_{S^c}\|_1\leq
    3\|X_S\|_1\}$. We say that $\Sigma$ satisfies the
    $(S,\gamma)$-\emph{restricted eigenvalue condition} iff:
\begin{equation}
    \forall X \in {\cal C(S)}, X^T \Sigma X \geq \gamma \|X\|_2^2
\tag{RE}
\label{eq:re}
\end{equation}
\end{definition}

A discussion of the $(S,\gamma)$-{\bf(RE)} assumption in the context of
generalized linear cascade models can be found in Section~\ref{sec:re}. In our
setting, we require that the {\bf(RE)}-condition holds for the Hessian of the
log-likelihood function $\mathcal{L}$: it essentially captures the fact that
the binary indicator vectors of the sets of active nodes (\emph{i.e.}, the
measurements) are not \emph{too} collinear.


\begin{comment}
Remember that in this case we have $\Theta_{i,j} = \log(1-p_{i,j})$. These
assumptions are reasonable: if an edge has a weight very close to 0, then the
``infection'' will never happen along that edge for our set of observations and
we can never hope to recover it.
\end{comment}

\begin{theorem}
\label{thm:main}
Assume the Hessian $\nabla^2\mathcal{L}(\theta^*)$ satisfies the
$(S,\gamma)$-{\bf(RE)} condition for some $\gamma > 0$ and that {\bf (LF)} holds
for some $\alpha > 0$. For any $\delta\in(0,1)$, let $\hat{\theta}$ be the
solution of \eqref{eq:pre-mle} with $\lambda \defeq 2\sqrt{\frac{\log m}{\alpha
n^{1 - \delta}}}$. Then:
\begin{equation}
    \label{eq:sparse}
    \|\hat \theta - \theta^* \|_2
    \leq \frac{6}{\gamma} \sqrt{\frac{s \log m}{\alpha n^{1-\delta}}}
\quad
\text{w.p.}\;1-\frac{1}{e^{n^\delta \log m}}
\end{equation}
\end{theorem}
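
As a sanity check, the right-hand side of \eqref{eq:sparse} is easy to evaluate
numerically; the snippet below is a direct transcription of the bound, with
variable names of our choosing.

\begin{verbatim}
import math

def l2_error_bound(s, m, n, gamma, alpha, delta):
    # Right-hand side of the bound in Theorem (main):
    # (6 / gamma) * sqrt(s * log(m) / (alpha * n^(1 - delta)))
    return 6.0 / gamma * math.sqrt(
        s * math.log(m) / (alpha * n ** (1.0 - delta)))
\end{verbatim}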

Note that we have expressed the convergence rate in terms of the number of
measurements $n$, which is distinct from the number of cascades. For example,
in the case of the voter model with horizon time $T$ and $N$ cascades, we can
expect a number of measurements proportional to $N\times T$.

Theorem~\ref{thm:main} is a consequence of Theorem~1 in \cite{Negahban:2009}
which gives a bound on the convergence rate of regularized estimators. We state
their theorem in the context of $\ell_1$ regularization in
Lemma~\ref{lem:negahban}.

\begin{lemma} \label{lem:negahban}
Let ${\cal C}(S) \defeq \{ \Delta \in \mathbb{R}^m\,|\,\|\Delta_{S^c}\|_1 \leq
3 \|\Delta_{S}\|_1 \}$. Suppose that:
\begin{multline}
    \label{eq:rc}
    \forall \Delta \in {\cal C}(S), \;
    {\cal L}(\theta^* + \Delta) - {\cal L}(\theta^*)\\
    - \inprod{\nabla {\cal L}(\theta^*)}{\Delta}
    \geq \kappa_{\cal L} \|\Delta\|_2^2 - \tau_{\cal L}^2(\theta^*)
\end{multline}
for some $\kappa_{\cal L} > 0$ and some function $\tau_{\cal L}$. Finally,
suppose that $\lambda \geq 2 \|\nabla {\cal L}(\theta^*)\|_{\infty}$. Then, if
$\hat{\theta}_\lambda$ is the solution of \eqref{eq:pre-mle}:
\begin{displaymath}
\|\hat \theta_\lambda - \theta^* \|_2^2
\leq 9 \frac{\lambda^2 s}{\kappa_{\cal L}}
+ \frac{\lambda}{\kappa_{\cal L}^2} 2 \tau^2_{\cal L}(\theta^*)
\end{displaymath}
\end{lemma}

To prove Theorem~\ref{thm:main}, we apply Lemma~\ref{lem:negahban} with
$\tau_{\mathcal{L}}=0$. Since $\mathcal{L}$ is twice differentiable and convex,
assumption \eqref{eq:rc} with $\kappa_{\mathcal{L}}=\frac{\gamma}{2}$ is implied
by the (RE)-condition. For a good convergence rate, we must find the smallest
possible value of $\lambda$ such that $\lambda \geq 2
\|\nabla\mathcal{L}(\theta^*)\|_{\infty}$. The upper bound on the $\ell_{\infty}$
norm of $\nabla\mathcal{L}(\theta^*)$ is given by Lemma~\ref{lem:ub}.

\begin{lemma}
\label{lem:ub}
Assume {\bf(LF)} holds for some $\alpha>0$. For any $\delta\in(0,1)$:
\begin{displaymath}
    \|\nabla {\cal L}(\theta^*)\|_{\infty}
    \leq 2 \sqrt{\frac{\log m}{\alpha n^{1 - \delta}}}
    \quad
    \text{w.p.}\; 1-\frac{1}{e^{n^\delta \log m}}
\end{displaymath}
\end{lemma}

The proof of Lemma~\ref{lem:ub} relies crucially on the Azuma-Hoeffding
inequality, which allows us to handle correlated observations. This departs
from the usual assumptions made in sparse recovery settings, where the
measurements are assumed to be independent of one another. We now show how
to use Theorem~\ref{thm:main} to recover the support of $\theta^*$, that is, to
solve the Network Inference problem.

\begin{corollary}
\label{cor:variable_selection}
Under the same assumptions as Theorem~\ref{thm:main}, let $\hat {\cal S}_\eta
\defeq \{ j \in \{1,\ldots, m\} : \hat{\theta}_j > \eta\}$ for $\eta > 0$. For
$0< \epsilon < \eta$, let ${\cal S}^*_{\eta + \epsilon} \defeq \{ j \in
\{1,\ldots,m\} :\theta_j^* > \eta +\epsilon \}$ be the set of all true `strong'
parents. Suppose the number of measurements satisfies $ n > \frac{9s\log
m}{\alpha\gamma^2\epsilon^2}$.  Then with probability $1-\frac{1}{m}$, ${\cal
S}^*_{\eta + \epsilon} \subseteq \hat {\cal S}_\eta \subseteq {\cal S}^*$. In
other words, we recover all `strong' parents and no false parents.
\end{corollary}

Assuming we know a lower bound $\alpha$ on $\Theta_{i,j}$,
Corollary~\ref{cor:variable_selection} can be applied to the Network Inference
problem in the following manner: pick $\epsilon = \frac{\eta}{2}$ and $\eta
= \frac{\alpha}{3}$, then $S_{\eta+\epsilon}^* = S$ provided that
$n=\Omega\left(\frac{s\log m}{\alpha^3\gamma^2}\right)$. That is, the support
of $\theta^*$ can be found by thresholding $\hat{\theta}$ at level $\eta$.
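
The thresholding step itself is straightforward; the sketch below assumes
$\hat\theta$ is available as a vector and that $\alpha$ is the known lower
bound on the nonzero weights, following the choice $\eta = \alpha/3$ made
above.

\begin{verbatim}
import numpy as np

def recover_support(theta_hat, alpha):
    # Threshold at eta = alpha / 3, as suggested after
    # Corollary (variable_selection): keep coordinates above eta.
    eta = alpha / 3.0
    return set(np.flatnonzero(theta_hat > eta))
\end{verbatim}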

\subsection{Approximate Sparsity}
\label{sec:relaxing_sparsity}

In practice, exact sparsity is rarely verified. For social networks in
particular, it is more realistic to assume that each node has a few ``strong''
parents and many ``weak'' parents. In other words, even if $\theta^*$ is not
exactly $s$-sparse, it can be well approximated by $s$-sparse vectors.

Rather than obtaining an impossibility result, we show that the bounds obtained
in Section~\ref{sec:main_theorem} degrade gracefully in this setting. Formally,
let
$
    \theta^*_{\lfloor s \rfloor}
    \in \argmin_{\|\theta\|_0 \leq s} \|\theta - \theta^*\|_1
    $
be the best $s$-approximation to $\theta^*$. Then we pay a cost proportional
to $\|\theta^* - \theta^*_{\lfloor s\rfloor}\|_1$ for recovering the weights of
non-exactly sparse vectors. This cost is simply the ``tail'' of
$\theta^*$: the sum of the $m-s$ smallest coordinates of $\theta^*$. We recover
the results of Section~\ref{sec:main_theorem} in the limit of exact sparsity.
These results are formalized in the following theorem, which is also a
consequence of Theorem 1 in \cite{Negahban:2009}.
\begin{theorem}
\label{thm:approx_sparse}
Suppose the {\bf(RE)} assumption holds for the Hessian $\nabla^2 \mathcal{L}(\theta^*)$
with $\tau_{\mathcal{L}}(\theta^*) = \frac{\kappa_2\log m}{n}\|\theta^*\|_1$ on
the following set:
\begin{align}
\nonumber
{\cal C}' \defeq & \{X \in \mathbb{R}^m : \|X_{S^c}\|_1 \leq 3 \|X_S\|_1
+ 4 \|\theta^* - \theta^*_{\lfloor s \rfloor}\|_1 \} \\ \nonumber
& \cap \{ \|X\|_1 \leq 1 \}
\end{align}
If the number of measurements $n\geq \frac{64\kappa_2}{\gamma}s\log m$, then by
solving \eqref{eq:pre-mle} for $\lambda \defeq 2\sqrt{\frac{\log m}{\alpha n^{1
- \delta}}}$ we have:
\begin{align*}
    \|\hat \theta - \theta^* \|_2 \leq \frac{3}{\gamma} \sqrt{\frac{s\log
    m}{\alpha n^{1-\delta}}} + 4 \sqrt[4]{\frac{s\log m}{\gamma^4\alpha
    n^{1-\delta}}} \|\theta^* - \theta^*_{\lfloor s \rfloor}\|_1
\end{align*}
\end{theorem}
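
To make the approximation error concrete, the following sketch computes the
best $s$-sparse approximation $\theta^*_{\lfloor s\rfloor}$ (keeping the $s$
largest coordinates in absolute value) together with the $\ell_1$ tail
appearing in Theorem~\ref{thm:approx_sparse}; the function name is ours.

\begin{verbatim}
import numpy as np

def best_s_approximation(theta, s):
    # Keep the s largest coordinates in absolute value; the l1 norm of
    # the remaining coordinates is the "tail" of Theorem (approx_sparse).
    order = np.argsort(np.abs(theta))[::-1]
    approx = np.zeros_like(theta)
    approx[order[:s]] = theta[order[:s]]
    tail = np.abs(theta[order[s:]]).sum()   # = ||theta - theta_[s]||_1
    return approx, tail
\end{verbatim}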

As in Corollary~\ref{cor:variable_selection}, an edge recovery guarantee can be
derived from  Theorem~\ref{thm:approx_sparse} in the case of approximate
sparsity.

\begin{comment}
\begin{corollary}
    Under the same assumptions as Theorem~\ref{thm:approx_sparse}, if the number
    of measurements verifies: \begin{equation}
        n > \frac{9}{\alpha\gamma^2\epsilon^2}\left(1+
        \frac{16}{\epsilon^2}\| \theta^* - \theta^*_{\lfloor
    s\rfloor}\|_1\right)s\log m
\end{equation}
then: ${\cal S}^*_{\eta + \epsilon} \subset \hat {\cal S}_\eta
\subset {\cal S}^*$ w.p. at least $1-\frac{1}{m}$.
\end{corollary}
\end{comment}

\subsection{Restricted Eigenvalue Condition}
\label{sec:re}

There exists a large class of sufficient conditions under which sparse recovery
is achievable in the context of regularized estimation~\cite{vandegeer:2009}.
The restricted eigenvalue condition, introduced in \cite{bickel:2009}, is one
of the weakest such assumptions. It can be interpreted as a restricted form of
non-degeneracy. Since we apply it to the Hessian of the log-likelihood function
$\nabla^2 \mathcal{L}(\theta)$, it essentially reduces to a form of restricted
strong convexity, which Lemma~\ref{lem:negahban} ultimately relies on.

Observe that the Hessian of $\mathcal{L}$ can be seen as a re-weighted
\emph{Gram matrix} of the observations:
\begin{multline*}
    \nabla^2\mathcal{L}(\theta^*)
    = \frac{1}{|\mathcal{T}|}\sum_{t\in\mathcal{T}}x^t(x^t)^T
    \bigg[x_i^{t+1}\frac{f''f-f'^2}{f^2}(\inprod{\theta^*}{x^t})\\
    -(1-x_i^{t+1})\frac{f''(1-f) + f'^2}{(1-f)^2}(\inprod{\theta^*}{x^t})\bigg]
\end{multline*}
If $f$ and $1-f$ are $c$-strictly log-convex~\cite{bagnoli2005log} for $c>0$,
then $ \min\left((\log f)'', (\log (1-f))'' \right) \geq c $. This implies that
the $(S, \gamma)$-({\bf RE}) condition in Theorem~\ref{thm:main} and
Theorem~\ref{thm:approx_sparse} reduces to a condition on the \emph{Gram
matrix} of the observations $X^T
X = \frac{1}{|\mathcal{T}|}\sum_{t\in\mathcal{T}}x^t(x^t)^T$ for $\gamma'
\defeq \gamma\cdot c$.
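
As an illustration, the empirical Gram matrix above and a simple
\emph{necessary} check for the {\bf(RE)} constant (the smallest eigenvalue of
the principal submatrix indexed by $S$) can be computed as follows; certifying
{\bf(RE)} over the full cone $\mathcal{C}(S)$ is substantially harder and is
not attempted here.

\begin{verbatim}
import numpy as np

def empirical_gram(X):
    # (1/|T|) * sum_t x^t (x^t)^T, with the observations x^t as rows of X.
    return X.T @ X / X.shape[0]

def min_eigenvalue_on_support(Sigma, S):
    # Smallest eigenvalue of Sigma restricted to the coordinates in S.
    # (S, gamma)-(RE) implies this value is at least gamma, so this is a
    # cheap necessary condition, not a certificate of (RE).
    idx = np.array(sorted(S))
    return np.linalg.eigvalsh(Sigma[np.ix_(idx, idx)]).min()
\end{verbatim}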

\paragraph{(RE) with high probability}

The Generalized Linear Cascade model yields a probability distribution over the
observed sets of infected nodes $(x^t)_{t\in\mathcal{T}}$. It is then natural
to ask whether the restricted eigenvalue condition is likely to hold under
this probabilistic model. Several recent papers show that large classes of
correlated designs obey the restricted eigenvalue property with high
probability \cite{raskutti:10, rudelson:13}.

The {\bf(RE)}-condition is well-behaved in the following sense: if it holds for
the expected Hessian matrix $\E[\nabla^2\mathcal{L}(\theta^*)]$, then it holds
for the finite sample Hessian matrix $\nabla^2\mathcal{L}(\theta^*)$ with high
probability.

If $f$ and $1-f$ are strictly log-convex, then the previous observations yield
the following natural interpretation of the {\bf(RE)}-condition: the expected
\emph{Gram matrix} $A \equiv \mathbb{E}[X^T X]$ is the matrix whose entry
$a_{i,j}$ is the average fraction of time steps at which node $i$ and node $j$
are infected simultaneously during several runs of the cascade process. Note
that the diagonal terms $a_{i,i}$ are simply the average fraction of time steps
at which the corresponding node $i$ is infected.

Therefore, under an assumption which only involves the probabilistic model and
not the actual observations, Proposition~\ref{prop:fi} allows us to draw the
same conclusion as in Theorem~\ref{thm:main}.

\begin{proposition}
    \label{prop:fi}
    Suppose $\E[\nabla^2\mathcal{L}(\theta^*)]$ satisfies the $(S,\gamma)$-{\bf
    (RE)} condition and assume {\bf (LF)} and {\bf (LF2)}. For $\delta> 0$, if
    $n^{1-\delta}\geq \frac{1}{28\gamma\alpha}s^2\log m $, then
    $\nabla^2\mathcal{L}(\theta^*)$ satisfies the $(S,\frac{\gamma}{2})$-(RE)
    condition w.p.\ $\geq 1-e^{-n^\delta\log m}$.
\end{proposition}

Observe that the number of measurements required in Proposition~\ref{prop:fi}
is now quadratic in $s$. If we only keep the first measurement of each
cascade, these measurements are independent and we can apply Theorem 1.8 from
\cite{rudelson:13}, lowering the number of required cascades to $s\log m \log^3(
s\log m)$.
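
In a practical implementation, keeping only the first measurement of each
cascade amounts to the following preprocessing step; the data layout below is
an assumption of ours, not something prescribed by the model.

\begin{verbatim}
import numpy as np

def first_measurements(cascades):
    # cascades: list of runs, each a list of (x_t, x_i_next) pairs.
    # Keeping only the first pair of every run yields measurements that
    # are independent across cascades.
    X, y = zip(*[run[0] for run in cascades])
    return np.array(X), np.array(y)
\end{verbatim}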

%\paragraph{(RE) vs Irrepresentability Condition}

%\cite{Daneshmand:2014} rely on an `incoherence' condition on the hessian of the
%likelihood function also known as the {\it (S,s)-irrepresentability} condition.

%\begin{comment}
%\begin{definition}
%Following similar notation, let $Q^* \defeq \nabla^2 f(\theta^*)$. Let $S$ and
%$S^c$ be the set of indices of all the parents and non-parents respectively and
%$Q_{S,S}$, $Q_{S^c,S}$, $Q_{S, S^c}$, and $Q_{S^c, S^c}$ the induced
%sub-matrices. Consider the following constant:
%\begin{equation}
%\nu_{\text{irrepresentable}}(S) \defeq \max_{\tau \in \mathbb{R}^p \ :\ \| \tau
%\|_{\infty} \leq 1} \|Q_{S^c, S} Q_{S, S}^{-1} \tau\|_{\infty}
%\end{equation}
%The (S,s)-irrepresentability holds if $\nu_{\text{irrepresentable}}(S) < 1 -
%\epsilon$ for $\epsilon > 0$
%\end{definition}
%\end{comment}

%It is possible to construct examples where the (RE) condition holds but not the
%irrepresentability condition \cite{vandegeer:2009}. Theorem 9.1 from the same
%paper shows that a `strong' irrepresentability condition directly {\it implies}
%the {\bf(RE)} condition for $\ell_2$-recovery.

\begin{comment}
\begin{proposition}
\label{prop:irrepresentability}
If the irrepresentability condition holds with $\epsilon > \frac{2}{3}$, then
the restricted eigenvalue condition holds with constant  $\gamma_n \geq \frac{
(1 - 3(1 -\epsilon))^2 \lambda_{\min}^2}{4s}n$, where $\lambda_{\min} > 0$ is
the smallest eigenvalue of $Q^*_{S,S}$, on which the results of
\cite{Daneshmand:2014} also depend.
\end{proposition}
\end{comment}

%\begin{comment}
%Furthermore, recent papers \cite{vandegeer:2011}, \cite{Zou:2006}, argue that
%the irrepresentability condition is unrealistic in situations where there is
%a correlation between variables. Consider the following simplified example from
%\cite{vandegeer:2011}:
%\begin{equation}
%\nonumber
%\left(
%\begin{array}{cccc}
%I_s & \rho J \\
%\rho J & I_s \\
%\end{array}
%\right)
%\end{equation}
%where $I_s$ is the $s \times s$ identity matrix, $J$ is the all-ones matrix and
%$\rho \in \mathbb{R}^+$. It is easy to see that $\nu_{\text{irrepresentable}}(S)
%= \rho s$ and $\lambda_{\min}(Q) \geq 1 - \rho$, such that for any $\rho >
%\frac{1}{s}$ and $\rho < 1$, the restricted eigenvalue holds trivially but the
%(S,s)-irrepresentability does not hold.


%\begin{lemma}
%Let ${\cal C}({\cal M}, \bar {\cal M}^\perp, \theta^*) \defeq \{ \Delta \in
%\mathbb{R}^p | {\cal R}(\Delta_{\bar {\cal M}^\perp} \leq 3 {\cal
%R}(\Delta_{\bar {\cal M}} + 4 {\cal R}(\theta^*_{{\cal M}^\perp}) \}$, where
%$\cal R$ is a \emph{decomposable} regularizer with respect to $({\cal M}, \bar
%{\cal M}^\perp)$, and $({\cal M}, \bar {\cal M})$ are two subspaces such that
%${\cal M} \subseteq \bar {\cal M}$. Suppose that $\exists \kappa_{\cal L} > 0,
%\; \exists \tau_{\cal L}, \; \forall \Delta \in {\cal C}, \; {\cal L}(\theta^* +
%\Delta) - {\cal L}(\theta^*) - \langle \Delta {\cal L}(\theta^*), \Delta \rangle
%\geq \kappa_{\cal L} \|\Delta\|^2 - \tau_{\cal L}^2(\theta^*)$. Let $\Psi({\cal
%M}) \defeq \sup_{u \in {\cal M} \backslash \{0\}} \frac{{\cal R}(u)}{\|u\|}$.
%Finally suppose that $\lambda \geq 2 {\cal R}(\nabla {\cal L}(\theta^*))$, where
%${\cal R}^*$ is the conjugate of ${\cal R}$. Then: $$\|\hat \theta_\lambda -
%\theta^* \|^2 \leq 9 \frac{\lambda^2}{\kappa_{\cal L}}\Psi^2(\bar {\cal M}) +
%\frac{\lambda}{\kappa_{\cal L}}\{2 \tau^2_{\cal L}(\theta^*) + 4 {\cal
%R}(\theta^*_{{\cal M}^\perp}\}$$
%\end{lemma}

%\subsection{The Independent Cascade Model}

%Recall that to cast the Independent Cascade model as a Generalized Linear
%Cascade, we performed the following change of variables: $\forall i,j
%\ \Theta_{i,j} = \log(1 - p_{i,j})$. The previous results hold \emph{a priori}
%for the ``effective'' parameter $\Theta$. The following lemma shows they also
%hold for the original infection probabilities $p_{i,j}$:

%The results of sections~\ref{sec:main_theorem} and \ref{sec:relaxing_sparsity}
%therefore hold for the original transition probabilities $p$.
%\end{comment}