In this appendix, we provide the proofs omitted from Section~\ref{sec:results}
and Section~\ref{sec:lowerbound}. We also report additional experiments on the
running time of our recovery algorithm that could not fit in the main body of
the paper.
\subsection{Proofs of Section~\ref{sec:results}}
\paragraph{Bounded gradient} We start with the proof of Lemma~\ref{lem:ub},
which gives an upper bound on the $\ell_{\infty}$-norm of the gradient of the
likelihood function.
\begin{proof}
The gradient of $\mathcal{L}$ is given by:
\begin{multline*}
\nabla \mathcal{L}(\theta^*) =
\frac{1}{|\mathcal{T}|}\sum_{t\in \mathcal{T}}x^t\bigg[
x_i^{t+1}\frac{f'}{f}(\inprod{\theta^*}{x^t})\\
- (1-x_i^{t+1})\frac{f'}{1-f}(\inprod{\theta^*}{x^t})\bigg]
\end{multline*}
Let $\partial_j \mathcal{L}(\theta^*)$ be the $j$-th coordinate of
$\nabla\mathcal{L}(\theta^*)$ and write
$\partial_j\mathcal{L}(\theta^*)
= \frac{1}{|\mathcal{T}|}\sum_{t\in\mathcal{T}} Y_t$. Since
$\E[x_i^{t+1}\,|\,x^t]= f(\inprod{\theta^*}{x^t})$, we have $\E[Y_t\,|\,x^t]
= 0$. Hence $Z_t = \sum_{k=1}^t Y_k$ is a martingale.
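Explicitly, $Y_t = x_j^t\big[x_i^{t+1}\frac{f'}{f}(\inprod{\theta^*}{x^t})
- (1-x_i^{t+1})\frac{f'}{1-f}(\inprod{\theta^*}{x^t})\big]$, and a direct
computation confirms that the conditional expectation vanishes:
\begin{displaymath}
\E[Y_t\,|\,x^t]
= x_j^t\left[f\cdot\frac{f'}{f} - (1-f)\cdot\frac{f'}{1-f}\right](\inprod{\theta^*}{x^t})
= x_j^t\big[f'-f'\big](\inprod{\theta^*}{x^t}) = 0.
\end{displaymath}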
Using assumption (LF), we have almost surely $|Z_{t+1}-Z_t|\leq
\frac{1}{\alpha}$ and we can apply Azuma's inequality to $Z_t$:
\begin{displaymath}
\P\big[|Z_{|\mathcal{T}|}|\geq \lambda\big]\leq
2\exp\left(\frac{-\lambda^2\alpha}{2n}\right)
\end{displaymath}
Since $\partial_j\mathcal{L}(\theta^*) = Z_{|\mathcal{T}|}/|\mathcal{T}|$, a
union bound over the $m$ coordinates of $\nabla\mathcal{L}(\theta^*)$ implies:
\begin{align*}
\P\big[\|\nabla\mathcal{L}(\theta^*)\|_{\infty}\geq \lambda \big]
&\leq 2m\exp\left(\frac{-\lambda^2n\alpha}{2}\right)
\end{align*}
Choosing $\lambda\defeq 2\sqrt{\frac{\log m}{\alpha n^{1-\delta}}}$ concludes
the proof.
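Indeed, with this choice of $\lambda$, the failure probability above becomes
\begin{displaymath}
2m\exp\left(-\frac{\lambda^2 n\alpha}{2}\right)
= 2m\exp\left(-2n^{\delta}\log m\right)
= 2\exp\big((1-2n^{\delta})\log m\big)
\leq 2e^{-n^{\delta}\log m},
\end{displaymath}
where the last inequality uses $n^{\delta}\geq 1$.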
\end{proof}
\paragraph{(RE) with high probability} We now prove Proposition~\ref{prop:fi}.
The proof relies mostly on showing that the Hessian of the likelihood function
$\mathcal{L}$ is sufficiently well concentrated around its expectation.
\begin{proof}Writing $H\defeq \nabla^2\mathcal{L}(\theta^*)$, if
$\|\E[H] - H\|_\infty\leq \lambda$
and $\E[H]$ satisfies the $(S,\gamma)$-(RE)
condition, then:
\begin{equation}
\label{eq:foo}
\forall \Delta\in C(S),\;
\Delta H\Delta \geq
\Delta \E[H]\Delta(1-32s\lambda/\gamma)
\end{equation}
Indeed, for $\Delta\in C(S)$,
$
|\Delta(H-\E[H])\Delta| \leq 2\lambda \|\Delta\|_1^2\leq
2\lambda(4\sqrt{s}\|\Delta_S\|_2)^2 = 32s\lambda\|\Delta_S\|_2^2
$.
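Combined with the $(S,\gamma)$-(RE) condition for $\E[H]$, which ensures
$\gamma\|\Delta_S\|_2^2\leq \Delta\E[H]\Delta$ for $\Delta\in C(S)$, this gives
\eqref{eq:foo}:
\begin{displaymath}
\Delta H\Delta = \Delta\E[H]\Delta + \Delta(H-\E[H])\Delta
\geq \Delta\E[H]\Delta - 32s\lambda\|\Delta_S\|_2^2
\geq \left(1-\frac{32s\lambda}{\gamma}\right)\Delta\E[H]\Delta.
\end{displaymath}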
Writing
$\partial^2_{i,j}\mathcal{L}(\theta^*)=\frac{1}{|\mathcal{T}|}\sum_{t\in
\mathcal{T}}Y_t$ and using assumptions (LF) and (LF2), we have $\big|Y_t - \E[Y_t]\big|\leq
\frac{3}{\alpha}$.
Applying Azuma's inequality as in the proof of Lemma~\ref{lem:ub}, this
implies:
\begin{displaymath}
\P\big[\|\E[H]-H\|_{\infty}\geq\lambda\big] \leq
2\exp\left(-\frac{n\alpha\lambda^2}{3} + 2\log m\right)
\end{displaymath}
Thus, if we take $\lambda=\sqrt{\frac{9\log m}{\alpha
n^{1-\delta}}}$, then $\|\E[H]-H\|_{\infty}\leq\lambda$ w.p.\ at least
$1-e^{-n^{\delta}\log m}$. When $n^{1-\delta}\geq
\frac{1}{28\gamma\alpha}s^2\log m$, \eqref{eq:foo} implies
$
\forall \Delta\in C(S),\;
\Delta H\Delta \geq \frac{1}{2} \Delta \E[H]\Delta,
$ w.p.~at least $1-e^{-n^{\delta}\log m}$ and the conclusion of
Proposition~\ref{prop:fi} follows.
\end{proof}
\subsection{Proof of Theorem~\ref{thm:lb}}
Let us consider an algorithm $\mathcal{A}$ which satisfies the recovery
guarantee of Theorem~\ref{thm:lb}: there exists a probability distribution over
measurements such that for all vectors $\theta^*$, \eqref{eq:lb} holds w.p.\
$\delta$. By the probabilistic method, this implies that for every distribution
$D$ over vectors $\theta$, there exists an $n\times m$ measurement matrix $X_D$
such that \eqref{eq:lb} holds w.p.\ $\delta$ ($\theta$ is now the random
variable).
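Spelled out, the averaging step is the following: for every fixed $\theta$, the
probability over the random measurements that \eqref{eq:lb} holds is at least
$\delta$, hence for any distribution $D$,
\begin{displaymath}
\E_{X}\,\P_{\theta\sim D}\big[\text{\eqref{eq:lb} holds}\big]
= \E_{\theta\sim D}\,\P_{X}\big[\text{\eqref{eq:lb} holds}\big] \geq \delta,
\end{displaymath}
so at least one fixed matrix $X_D$ achieves
$\P_{\theta\sim D}\big[\text{\eqref{eq:lb} holds}\big]\geq\delta$.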
Consider the following distribution $D$: choose $S$ uniformly at random from a
``well-chosen'' set of $s$-sparse supports $\mathcal{F}$ and $t$ uniformly at
random from
$X \defeq\big\{t\in\{-1,0,1\}^m\,|\, \mathrm{supp}(t)\in\mathcal{F}\big\}$.
Define $\theta = t + w$ where
$w\sim\mathcal{N}(0, \alpha\frac{s}{m}I_m)$ and $\alpha = \Omega(\frac{1}{C})$.
Consider the following communication game between Alice and Bob: \emph{(1)}
Alice sends to Bob a vector $y\in\{0,1\}^n$ whose coordinates are drawn from
Bernoulli distributions with parameters $f(X_D\theta)$. \emph{(2)} Bob uses
$\mathcal{A}$ to recover $\hat{\theta}$ from $y$. It can be shown that at the
end of the game Bob holds $\Omega(s\log \frac{m}{s})$ bits of information about
$S$. By the Shannon-Hartley theorem, this information is also upper-bounded by
$\O(n\log C)$. These two bounds together imply the theorem.
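In equation form (omitting constants), the two bounds combine into a lower
bound on the number of measurements of the form
\begin{displaymath}
s\log\frac{m}{s} \;\lesssim\; I(S; y) \;\lesssim\; n\log C
\qquad\Longrightarrow\qquad
n = \Omega\left(\frac{s\log(m/s)}{\log C}\right).
\end{displaymath}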
\subsection{Running Time Analysis}
\begin{figure}
\centering
\includegraphics[scale=.4]{figures/n_nodes_running_time.pdf}
\caption{Running time for estimating the parents of a \emph{single
  node} on a Barab\'asi-Albert graph, as a function of the number of nodes in
  the graph. The parameter $k$ (the number of existing nodes each new node is
  attached to) is set to $30$, $p_{\text{init}}$ is set to $0.15$,
  the edge weights are drawn uniformly at random in $[0.2,0.7]$, and the
  penalization parameter $\lambda$ is set to $0.1$.}
\label{fig:running_time_n_nodes}
\end{figure}
\begin{figure}
\centering
\includegraphics[scale=.4]{figures/n_cascades_running_time.pdf}
\caption{Running time for estimating the parents of a \emph{single
  node} on a Barab\'asi-Albert graph, as a function of the total number of
  observed cascades. The parameters defining the graph are set as in
  Figure~\ref{fig:running_time_n_nodes}.}
\label{fig:running_time_n_cascades}
\end{figure}
We include here a running time analysis of our algorithm. In
Figure~\ref{fig:running_time_n_nodes}, we compare our algorithm to the
benchmarks for increasing numbers of nodes. In
Figure~\ref{fig:running_time_n_cascades}, we compare our algorithm to the
benchmarks on a fixed graph for an increasing number of observed cascades.
In both figures, unsurprisingly, the simple greedy algorithm is the fastest.
Even though both the MLE algorithm and the algorithm we introduced are based on
convex optimization, the MLE algorithm is faster; this is due to the overhead
caused by the $\ell_1$-regularization in~\eqref{eq:pre-mle}.
As expected, the running time grows linearly with the number of observed
cascades. The slope is largest for our algorithm, which is again caused by the
overhead induced by the $\ell_1$-regularization.
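For completeness, we include below a minimal Python sketch of the synthetic
setup described above. It generates a Barab\'asi-Albert graph with
\texttt{networkx}, draws edge weights uniformly at random in $[0.2,0.7]$,
simulates discrete-time cascades with initial activation probability
$p_{\text{init}}=0.15$, and times the recovery of the parents of a single node.
The cascade dynamics (an independent-cascade-style process) and the estimator
(an $\ell_1$-penalized logistic regression from \texttt{scikit-learn} with
penalization parameter $\lambda=0.1$) are illustrative stand-ins rather than
the exact model and solver used in our experiments; the helper names
\texttt{make\_graph}, \texttt{simulate\_cascades} and \texttt{recover\_parents}
are purely illustrative.
\begin{verbatim}
import time
import numpy as np
import networkx as nx
from sklearn.linear_model import LogisticRegression

def make_graph(n_nodes, k=30, w_min=0.2, w_max=0.7, seed=0):
    """Barabasi-Albert graph, edge weights uniform in [w_min, w_max]."""
    rng = np.random.default_rng(seed)
    g = nx.barabasi_albert_graph(n_nodes, k, seed=seed)
    weights = rng.uniform(w_min, w_max, size=(n_nodes, n_nodes))
    return nx.to_numpy_array(g) * weights  # weighted adjacency matrix

def simulate_cascades(adj, n_cascades, p_init=0.15, max_steps=10, seed=0):
    """Discrete-time, independent-cascade-style dynamics: node i activates
    at time t+1 with probability 1 - prod_j (1 - adj[j, i] * x_j^t)."""
    rng = np.random.default_rng(seed)
    n = adj.shape[0]
    snapshots = []  # list of (x^t, x^{t+1}) pairs
    for _ in range(n_cascades):
        x = (rng.random(n) < p_init).astype(float)
        for _ in range(max_steps):
            p_next = 1.0 - np.prod(1.0 - adj * x[:, None], axis=0)
            x_next = np.maximum(x, (rng.random(n) < p_next).astype(float))
            snapshots.append((x.copy(), x_next.copy()))
            if np.array_equal(x, x_next):
                break
            x = x_next
    return snapshots

def recover_parents(snapshots, node, lam=0.1):
    """Stand-in estimator: l1-penalized logistic regression of x_node^{t+1}
    on x^t, restricted to snapshots where the node is still inactive."""
    pairs = [(x, x_next) for x, x_next in snapshots if x[node] == 0]
    X = np.array([x for x, _ in pairs])
    y = np.array([x_next[node] for _, x_next in pairs])
    if y.min() == y.max():  # degenerate case: only one class observed
        return np.array([], dtype=int)
    clf = LogisticRegression(penalty="l1", solver="liblinear", C=1.0 / lam)
    clf.fit(X, y)
    return np.flatnonzero(clf.coef_[0])

if __name__ == "__main__":
    adj = make_graph(n_nodes=500)
    snaps = simulate_cascades(adj, n_cascades=200)
    node = adj.shape[0] - 1  # a late-arriving node, degree exactly k
    start = time.time()
    parents = recover_parents(snaps, node)
    print(f"recovered {len(parents)} candidate parents "
          f"in {time.time() - start:.2f}s")
\end{verbatim}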