finale/mid_report.tex


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504

\documentclass[10pt]{article}
\usepackage[utf8x]{inputenc}
\usepackage{amsmath, amssymb, amsthm, microtype}
\usepackage{algpseudocode, algorithm, algorithmicx}
\usepackage[pagebackref=false,breaklinks=true,
            colorlinks=true,citecolor=blue]{hyperref}
\usepackage[capitalize, noabbrev]{cleveref}
\usepackage{graphicx, subfig}
\usepackage{bbm}
\usepackage{fullpage}

\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator{\E}{\mathbb{E}}
\let\P\relax
\DeclareMathOperator{\P}{\mathbb{P}}
\newcommand{\ex}[1]{\E\left[#1\right]}
\newcommand{\prob}[1]{\P\left[#1\right]}
\newcommand{\inprod}[1]{\left\langle #1 \right\rangle}
\newcommand{\R}{\mathbf{R}}
\newcommand{\N}{\mathbf{N}}
\newcommand{\C}{\mathcal{C}}
\newcommand{\eps}{\varepsilon}
\newcommand{\bt}{\boldsymbol{\theta}}
\newcommand{\bx}{\mathbf{x}}
\newcommand{\cl}[1]{\text{\textbf{#1}}}
\newcommand{\eqdef}{\mathbin{\stackrel{\rm def}{=}}}

\newtheorem{theorem}{Theorem}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\theoremstyle{remark}
\newtheorem*{example}{Example}
\newtheorem*{remark}{Remark}

\title{Network Inference from Cascades}
\author{Thibaut Horel \and Jean Pouget-Abadie}

\begin{document}

\maketitle

\begin{abstract}
    The Network Inference Problem (NIP) is the machine learning challenge of
    recovering the edges and edge weights of an unknown weighted graph from the
    observations of a random contagion process propagating over this graph.
    While estimators for the edge weights with provable convergence guarantees
    have been obtained under various formulations of the NIP, a rigorous
    statistical treatment of the problem is still lacking. In this work, we
    build upon the unified NIP formulation of \cite{pouget} to explore the
    connections between the topological properties of the graph to be learnt
    and the resulting quality of the estimators. Specifically, we analyze which
    properties of the graph render NIP unfeasible or hard, and which properties
    can be exploited to improve the quality of the estimators.
\end{abstract}

\section{Introduction}

A random contagion process propagating over a graph provides valuable
information about the presence and weights of its edges. This fact is exploited
in many real-life applications, including learning how people influence each
other by observing how they communicate or share information in social
networks, learning protein activation graphs in biology or learning correlations
between financial assets.

Formally, this problem can be formulated as a machine learning task where one
specifies a probabilistic contagion model whose parameters are the edge weights
of the unknown graph. Then, given observations drawn from this random process,
one seeks to construct an estimator of the edge weights. This problem is known
in the literature as the Network Inference Problem

A growing body of works has focused on obtaining convergence guarantees for
estimators of the edge weights under various formulations of the Network
Inference Problem. In \cite{pouget}, we noted that many of these formulations
could be interpreted as specific instances of a common \emph{Generalized Linear
Cascade Model} (GLMC) and showed how to apply results from the sparse recovery
literature to obtain close-to-optimal convergence guarantees.

However, in \cite{pouget} as in previous works, the convergence guarantees rely
on many assumptions about the non-degeneracy of the probabilistic model or the
observations. These assumptions are hard to interpret and weaken possible
applications of the obtained results to real-life applications. Furthermore,
previous works focused on obtaining generic convergence guarantees and it is
not obvious that the developed frameworks can be adapted in presence of domain
specific knowledge about the graph to be learned. The goal of the current work
is to address these shortcomings by asking the following question: \emph{how do
    properties of the graph to be learned impact the task of solving the
    Network Inference Problem for this graph?}. Specifically, we wish to
    understand:
\begin{itemize}
    \item which properties of the graph render the Network Inference Problem
        unfeasible or hard to solve. This relates to the question of
        identifiability of the contagion model, and understanding how the
        constants which appear on convergence rate bounds for edge weights
        estimators relate to topological properties of the graph.
    \item how domain specific knowledge about the graph to be learned can be
        exploited to make the Network Inference Problem easier to solve.
        A natural framework for this is a Bayesian one, where the domain
        specific knowledge is encoded as a Bayesian prior. To the best of our
        knowledge, this approach has not been explored beyond MAP estimation
        under a sparse (Laplace) prior in the context of discrete-time models.
\end{itemize}

The organization of the current report is as follows: we conclude the
introduction with a review of related works. The Generalized Linear Cascade
(GLC) model from \cite{pouget} is presented in Section 2. Section 3. explores
the question of identifiablity of the model. Section 4. presents both the
frequentist and Bayesian approach for inference of the GLC model as well as
a novel active learning formulation of the problem. The code used in our
experiments can be found in the Appendix.

\paragraph{Related works.}
The study of edge prediction in graphs has been an active field of research for
over a decade~\cite{Nowell08, Leskovec07, AdarA05}.~\cite{GomezRodriguez:2010}
introduced the {\scshape Netinf} algorithm, which approximates the likelihood of
cascades represented as a continuous process.  The algorithm, relying on
Maximum-Likelihood and convex relaxations was improved in later
work~\cite{gomezbalduzzi:2011}, but is not known to have any theoretical
guarantees beside empirical validation on synthetic networks.

Let us denote by $m$ the number of nodes in the graph and for a given node, we
denote by $s$ its in-degree. In what follows, we call a \emph{recovery
guarantee} an upper bound on the number of observations required (expressed as
a function of $s$ and $m$) to learn the incoming edges of a node to a fixed
arbitrary accuracy level. \cite{Netrapalli:2012} studied the discrete-time
version of the independent cascade model and obtained the first ${\cal O}(s^2
\log m)$ recovery guarantee on general networks, by using unregularized MLE and
making a {\it correlation decay\/} assumption, which limits the number of new
infections at every step.  They also suggest a {\scshape Greedy} algorithm,
with provably good performance in tree graphs, which maintains a counter of
each node's antecedents, i.e. the set of nodes infected at a prior time step to
its infection time. The work of~\cite{Abrahao:13} studies the same
continuous-model framework as \cite{GomezRodriguez:2010} and obtains an ${\cal
O}(s^9 \log^2 s \log m)$ support recovery algorithm, without the
\emph{correlation decay} assumption.  \cite{du2013uncover,Daneshmand:2014}
propose Lasso regularization of MLE for recovering the weights of the graph
under a continuous-time independent cascade model. The work
of~\cite{du2014influence} is slightly orthogonal to ours since they suggest
learning the \emph{influence} function, rather than the parameters of the
network directly.

More recently, the continuous process studied in previous work has been
reformulated as a Hawkes process, with recent papers
\cite{linderman2014discovering, linderman2015scalable, simma2012modeling},
focusing on Expectation-Maximization, and Variational Inference methods to learn
these processes. Because of the discrete nature of the GLC model, we hope to
bring a better understanding to the link between the properties of a graph and
its \emph{learnability}. We wish to further explore the idea of non-product
priors raised in \cite{linderman2014discovering, linderman2015scalable}, since
the experimental validation of their work focused on simple graph priors.
Finally, the \emph{Active Learning} formulation is, to the best of the
authors' knowledge, novel in this context.

\section{Model}

The GLC model is described over a directed graph $G = (V, \Theta)$. Denoting by
$k=|V|$ the number of nodes in the graph, $\Theta\in\R_{+}^{k\times k}$ is the
matrix of edge weights. Note that $\Theta$ implicitly defines the edge set $E$ of
the graph through the following equivalence:
\begin{displaymath}
    (u,v)\in E\Leftrightarrow \Theta_{u,v} > 0,\quad
    (u,v)\in V^2
\end{displaymath}

The time is discretized and indexed by a variable $t\in\N$. The nodes can be in
one of three states: \emph{susceptible}, \emph{infected} or \emph{immune}.
Let us denote by $S_t$ the set of nodes susceptible at the beginning of time
step $t\in\N$ and by $I_t$ the set of nodes who become infected at this time
step. The following dynamics:
\begin{displaymath}
    S_0 = V,\quad S_{t+1} = S_t \setminus I_t
\end{displaymath}
expresses that the nodes infected at a time step are no longer susceptible
starting from the next time step (they become part of the immune nodes).

The dynamics of $I_t$ are described by a random Markovian process. Let us
denote by $x_t$ the indicator vector of $I_t$ at time step $t\in\N$. $x_0$ is
drawn from a \emph{source distribution} $p_s:\{0,1\}^n\to[0,1]$. For $t\geq 1$,
we have:
\begin{equation}
    \label{eq:markov}
    \forall i\in S_t,\quad
    \P\big(x_{i}^{t} = 1\,|\, x^{t-1}\big) = f(\bt_i\cdot x^{t-1})
\end{equation}
where $\bt_i$ is the $i$th column of $\Theta$. The function $f:\R\to[0,1]$ can
be interpreted as the inverse link function of the model. Finally, the
transitions in \cref{eq:markov} occur independently for each $i$. A cascade
continues until no infected nodes remains.

We refer the reader to \cite{pouget} for a more complete description of the
model and examples of common contagion models which can be interpreted as
specific instances of the GLC model.

\section{Identifiability}

It follows from Section 2, that a source distribution $p_s$ and
\cref{eq:markov} together completely specify the distribution $p$ of a cascade
$\mathbf{x} = (x_t)_{t\geq 0}$:
\begin{equation}
    \label{eq:dist}
    p(\bx;\Theta) = p_s(x^0)\prod_{t\geq 1}\prod_{i\in S_t}
    f(\bt_i\cdot x^{t-1})^{x^t_i}\big(1-f(\theta_i\cdot x^{t-1})\big)^{1-x_i^t}
\end{equation}

In what follows, we consider that $p_s$ and $f$ are fixed. Hence the only
parameter of the model is the matrix $\Theta$ of parameters that we wish to
learn. We note that the problem of learning $\Theta$ might be ill-defined in
case of unidentifiability. We recall the definition of identifiability below.

\begin{definition}
    We say that the model $\mathcal{P}
    = \big\{p(\cdot\,;\,\Theta)\,|\,\Theta\in\R_+^{k\times k}\big\}$ is
    identifiable iff:
    \begin{displaymath}
        \forall \Theta,\Theta'\in\R_+^{k\times k},\quad
        p(\cdot\,;\,\Theta) = p(\cdot\, ;\, \Theta') \Rightarrow \Theta = \Theta'
    \end{displaymath}
\end{definition}

For the GLC model, identifiability is not guaranteed. To develop an intuition
about why this might be the case, we present below examples of
unidentifiability because of two obstructions: lack of information and parent
ambiguity.

\begin{figure}
\begin{center}
    \includegraphics[scale=0.8]{example.pdf}
\end{center}
\caption{Example of a graph which can be unidentifiable, both because of lack
    of information or parent ambiguity if the source distribution is chosen
adversarially.}
\label{fig:ident}
\end{figure}

\vspace{.5em}

\begin{example}[Lack of information]
    Let us consider the graph in \cref{fig:ident} and the source distribution
    $p_s$ defined by $p_s\big((0, 1, 0)\big) = 1$. In other words, node $2$ is
    always the only infected node at $t=0$. Then, it is clear that all
    casacades die at $t=1$ at which node 3 becomes infected with probability
    $f(\theta_{2,3})$. This probability does not depend on $\theta_{1,3}$.
    Hence all matrices $\Theta$ compatible with the graph (defining the same
    edge set) and for which $\theta_{2,3}$ is constant yield the same cascade
    distribution according to \cref{eq:dist}.
\end{example}

\vspace{.5em}

\begin{example}[Parent ambiguity]
Let us consider again the graph in \cref{fig:ident} and let us consider the
case where the source distribution is such that $p_s\big((1,1,0)\big) = 1$,
\emph{i.e} nodes 1 and 2 are always jointly infected at time $t=0$. Then it is
clear that cascades will all die at time $t=1$ where node 3 becomes infected
with probability $f(\theta_{1,3} + \theta_{2,3})$. This implies that all
matrices $\Theta$ which are compatible with the graph in \cref{fig:ident}
(defining the same edge set) and for which $\theta_{1,3} + \theta_{2,3} = c$
for some $c\in \R_+$ define the same cascade probability distribution.
\end{example}

\vspace{.5em}

It appears from these examples show that the question of identifiability is
closely related to the question of reachability of nodes from the source. The
following proposition shows that with the right assumptions on the source
distribution we can guarantee identifiability of the model.

We will slightly abuse the notation $p_s$ and for $u\in V$ write:
\begin{displaymath}
    p_s(w) \eqdef \sum_{\bx: \bx_w = 1} p_s(\bx)
\end{displaymath}
and similarly for a set of vertics $S$,  $p_s(S)$ is defined by replacing the
condition $\bx_w = 1$ by $\bx_S = 1$ in the summation condition in the above
equation. Furthermore, we write $u\leadsto v$ if there is a directed path from
$u$ to $v$ in $G$ and $u\leadsto v\leadsto w$ if there is a directed path from
$u$ to $w$ passing through $v$ in $G$.

\begin{proposition}
Under the assumptions:
\begin{enumerate}
    \item $\forall z\in \mathbf{\R_+},\; f(z)< 1$
    \item for all $S\subseteq V$ with $|S|\geq 2$, $p_s(S)<1$.
    \item $\forall (u,v)\in V^2$, there exists $w$ such that $p_s(w)\neq 0$ and
        $w\leadsto u\leadsto v$.
\end{enumerate}
then the GLC model is identifiable.
\end{proposition}

\begin{proof}[Proof sketch] We only give a quick proof sketch here and defer
    the details to the final version of this report. Let us consider $\Theta
    \neq \Theta'$. The goal is to prove that $p(\cdot\,;\,\Theta)\neq
    p(\cdot\,\;\, \Theta')$. Since $\Theta \neq \Theta'$ there exists a pair
    $(u,v)$ such that $\Theta_{u,v} \neq \Theta'_{u,v}$. Using assumption 3. we
    are guaranteed that with non-zero probability there will exist a time-step
    $t$ such that $x^t_u = 1$. We then distinguish two cases:
    \begin{itemize}
        \item $t=0$, in this case assumption 2. guarantees that with non-zero
            probability $u$ will be the only infected node at $t=0$.
        \item $t\geq 1$, in this case assumption 1. guarantees that with
            non-zero probability $u$ will be the only infected node at
            time-step $t$ (because the transitions in \cref{eq:markov} occur
            independently).
    \end{itemize}
    In both cases, it then follows from the form of \cref{eq:dist} after
    conditioning on $x^{t'}$ for $t' < t$ that the probability that $v$ will be
    infected at $t+1$ is different under $\Theta$ and under $\Theta'$.
\end{proof}


\begin{remark}
Intuitively, the assumptions of the proposition can be interpreted as follows:
assumption 2. prevents parent ambiguity at the source; assumption 1. prevents
parent ambiguity from occurring at later time-steps in the cascade even when
there is no ambiguity at the source; finally, assumption 3. prevents lack of
information.

When the graph $G$ is strongly connected, examples of simple source
distributions which satisfy the assumptions of the proposition include a single
source chosen uniformly at random among the vertices of the graph, or a source
where each node in $G$ is included independently with some probability $p>0$.

Finally, assumption 1 can be relaxed to a weaker assumption. Under this weaker
assumption, Proposition 2. becomes an exact characterization of the
identifiability of the model. Again, we defer the details to the final version
of this report.
\end{remark}

\section{Inference}

\subsection{Maximum-Likelihood Estimation}
\label{MLEsec}

It follows from the form of \cref{eq:dist} that the log-likelihood of $\Theta$
given cascade data can be written as the sum of $k$ terms, each term $i$ only
depending on $\bt_i$. Hence, each $\bt_i$ can be learnt separately by solving
a node-specific optimization problem. Specifically, for a given node $i$, if we
concatenate together all the time steps $t$ where this node was susceptible and
write $y^t = x^{t+1}_i$, \emph{i.e} whether or not the node became infected at
the next time step, the MLE estimator for $\bt_i$ is obtained by solving the
following optimization problem:
\begin{equation}
    \label{eq:mle}
    \hat{\theta}\in \argmax_\theta \sum_{t} y^t\log f(\theta\cdot x^t)
    + (1-y^t) \log \big(1 - f(\theta\cdot x^t)\big)
\end{equation}
It is interesting to note that at the node-level, doing MLE inference for the
GLC model is exactly amounts to fitting a Generalized Linear Model. When $f$ is
log-concave as is the case in most examples of GLC models, then the above
optimization problem becomes a convex optimization problem which can be solved
exactly and efficiently. The code to perform MLE estimation can be found in the
appendix, file \textsf{mle.py}.

Furthermore, as usual for an MLE estimator, we obtain that $\hat{\theta}$ is
asymptotically consistent under mild assumptions.

\begin{proposition}
    Under the assumptions of Proposition 2., and further assuming that
    $\lim_{z\to\infty} f(z) = 1$, the optimization program \eqref{eq:mle} has
    a unique solution $\hat{\theta}$ which is an asymptotically consistent
    estimator of $\bt_i$.
\end{proposition}

\subsection{Bayesian Approach}

\paragraph{Notation}
Let's adopt a Bayesian approach to the previous problem. To fix the notation
introduced above, we let $G=(V, \Theta)$ be a weighted directed graph with
$\Theta \in \R_+^{k \times k}$. We place a prior $\mathcal{D}$ on $\Theta$. Let
$x_0$ be a source node, sampled from distribution $p_s$ over the vertices $V$ of
the graph $\mathcal{G}$. Since we assume for now that the state of the source
node is entirely observed, the choice of $p_s$ is not relevant to the inference
of $\Theta$. Finally, we let $x_t$ (resp. $\mathbbm{1}_{S_t}$) be the binary
vectors encoding the infected (resp.~susceptible) nodes at time $t$:
\begin{alignat*}{2}
 \theta & \sim \mathcal{D}\qquad &&\text{(prior on $\theta$)} \\
 x_0 & \sim p_s &&\text{(random source distribution)}\\
 \forall t\geq 0, x_{t+1} | x_t, \theta &\sim Ber(f(\theta\cdot x_t)\cdot
\mathbbm{1}_{S_t})
 &&\text{(cascade process)}
\end{alignat*}

\paragraph{Graph prior}
Prior work has focused on MLE estimation regularized using lasso, summarized in
the section above. This is equivalent to choosing an independent Laplace prior
for $\theta$:
$$(\Theta_{i,j})\sim \prod_{i,j} Laplace(0,1)$$
In the context of social network learning however, we can place more informative
priors. We can:

\begin{itemize}
\item Take into account prior knowledge of edges' existence (or lack thereof)
\item Take into account common graph structures, such as triangles, clustering
\end{itemize}

A common prior for graph is the Exponential Random Graph Model (ERGM), which
allows flexible representations of networks and Bayesian inference. The
distribution of an ERGM family is defined by feature vector $s(G)$ and by the
probability distribution:
$$P(G | \Theta) \propto \exp \left( s(G)\cdot \Theta \right)$$

Though straightforward MCMC could be applied here, recent
work~\cite{caimo2011bayesian, koskinen2010analysing, robins2007recent} has shown
that ERGM inference has slow convergence and lack of robustness, developping
better alternatives to naive MCMC formulations. Experiments using such a prior
are ongoing, but we present results only for simple product distribution-type
priors here.

\paragraph{Inference}
We can sample from the posterior by MCMC\@ using the PyMC library, which works
relatively well (see paragraph on \emph{experiments}. If the prior is a product
distribution, then we can simply do Bayesian GLM regression, as in
Section~\ref{MLEsec}. However, computing the posterior edge-by-edge does not
accomodate for complex priors, which could encourage for example the presence of
triangles or two stars. For such priors, we must sample from the posterior of
the Markov chain directly, which is what we have done here.  This method is very
slow however, and scales badly as a result of the number of cascades and number
of nodes. We could greatly benefit from using an alternative method:
\begin{itemize}
\item EM\@. This approach was used in \cite{linderman2014discovering,
simma2012modeling} to learn
the parameters of a Hawkes process, a closely related inference problem.
\item Variational Inference. This approach was used
in~\cite{linderman2015scalable} as an extension to the paper cited in the
previous bullet point. Considering the scalabilty of their approach, we hope to
apply their method to our problem here, due to the similarity of the two
processes, and to the computational constraints of running MCMC over a large
parameter space.
\end{itemize}


\begin{figure}
\subfloat[][50 cascades]{
\includegraphics[scale=.4]{../simulation/plots/2015-11-05_22:52:30.pdf}}
\subfloat[][100 cascades]{
\includegraphics[scale=.4]{../simulation/plots/2015-11-05_22:52:47.pdf}}\\
\subfloat[][150 cascades]{
\includegraphics[scale=.4]{../simulation/plots/2015-11-05_22:53:24.pdf}}
\subfloat[][200 cascades]{
\includegraphics[scale=.4]{../simulation/plots/2015-11-05_22:55:39.pdf}}\\
\subfloat[][250 cascades]{
\includegraphics[scale=.4]{../simulation/plots/2015-11-05_22:57:26.pdf}}
\subfloat[][1000 cascades]{
\includegraphics[scale=.4]{../simulation/plots/2015-11-05_22:58:29.pdf}}
\caption{Bayesian Inference of $\Theta$ with MCMC using a $Beta(1, 1)$ prior on
each edge.  For each figure, the plot $(i, j)$ on the $i^{th}$ row and $j^{th}$
column represent a histogram of samples taken from the  posterior of the
corresponding edge $\Theta_{i, j}$. The red line indicates the true value of the
edge weight.  If an edge does not exist (has weight $0$) the red line is
confounded with the y axis.}
\label{betapriorbayeslearning}
\end{figure}

\paragraph{Experiments}

We ran some experiments on a simple network with 4 nodes with $\binom{4}{2}=6$
parameters to learn with the MCMC package PyMC\@. We plot in
Figure~\ref{betapriorbayeslearning} the progressive learning of $\Theta$ for
increasing numbers of observations. Of note, since the IC model does not include
self-loops, the diagonal terms are never properly learned, which is expected but
not undesirable. We notice that the existence or not of an edge is (relatively)
quickly learned, with the posterior on edges with no weight converging to $0$
after $100$ cascades. To get a concentrated posterior around the true non-zero
edge weigth requires $1000$ cascades, which is unreasonably high considering the
small number of parameters that we are learning in this experiment.

\subsection{Active Learning}

In this setup, $S$ is no longer drawn from a random distribution $p_s$ but is
chosen by the designer of the experiment. Once a source is drawn, a cascade is
drawn from the Markovian process, $\mathcal{D'}$.  We wish to choose the source
which maximises the information gain on the parameter $\Theta$, so as to speed
up learning. We suggest to choose the source which maximises the mutual
information between $\theta$ and $X$ at each time step. The mutual information
can be computed node-by-node by sampling:
\begin{equation*}
I((x_t)_{t\geq 1} ,\Theta | x_0 = i) = \mathbb{E}_{\substack{\Theta \sim D_t \\
x | \Theta, i \sim D'}}\log p(x|\Theta) - \mathbb{E}_{\substack{\Theta \sim D_t
\\ x' | \Theta, i \sim D'}} \log p(x')
\end{equation*}
The second term involves marginalizing over $\Theta$: $p(x') =
\mathbb{E}_{\Theta \sim D_t} p(x'| \Theta)$.

\begin{algorithm}
\caption{Active Bayesian Learning through Cascades (ABC)}
\begin{algorithmic}[1]
\State $\theta \sim \mathcal{D}_0 = \mathcal{D}$ \Comment{Initial prior on
$\theta$}
\While{True}
\State $i \leftarrow \arg\min_{i} I(\theta, (x_t)_{t \geq 1} | x_0 = i)$
\Comment{Maximize mutual information}
\State Observe $(x_t)_{t\geq1} |x_0 = i$ \Comment{Observe cascade from source}
\State $\mathcal{D}_{t+1} \gets \text{posterior of $\theta \sim \mathcal{D}_t$
given $(x_t)_{t\geq1}$}$ \Comment{Update posterior on $\theta$}
\EndWhile
\end{algorithmic}
\end{algorithm}

\bibliography{sparse}{}
\bibliographystyle{abbrv}

\end{document}