\documentclass[10pt]{article}
\usepackage[utf8x]{inputenc}
\usepackage{amsmath, amssymb, amsthm, microtype}
\usepackage{algpseudocode, algorithm, algorithmicx}
\usepackage[pagebackref=false,breaklinks=true,
            colorlinks=true,citecolor=blue]{hyperref}
\usepackage[capitalize, noabbrev]{cleveref}
\usepackage{graphicx, subfig}
\usepackage{bbm}
\usepackage{fullpage}

\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator{\E}{\mathbb{E}}
\let\P\relax
\DeclareMathOperator{\P}{\mathbb{P}}
\newcommand{\ex}[1]{\E\left[#1\right]}
\newcommand{\prob}[1]{\P\left[#1\right]}
\newcommand{\inprod}[1]{\left\langle #1 \right\rangle}
\newcommand{\R}{\mathbf{R}}
\newcommand{\N}{\mathbf{N}}
\newcommand{\C}{\mathcal{C}}
\newcommand{\eps}{\varepsilon}
\newcommand{\bt}{\boldsymbol{\theta}}
\newcommand{\bx}{\mathbf{x}}
\newcommand{\cl}[1]{\text{\textbf{#1}}}
\newcommand{\eqdef}{\mathbin{\stackrel{\rm def}{=}}}

\newtheorem{theorem}{Theorem}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\theoremstyle{remark}
\newtheorem*{example}{Example}
\newtheorem*{remark}{Remark}

\title{Network Inference from Cascades}
\author{Thibaut Horel \and Jean Pouget-Abadie}

\begin{document}

\maketitle

\begin{abstract}
    The Network Inference Problem (NIP) is the machine learning challenge of
    recovering the edges and edge weights of an unknown weighted graph from the
    observations of a random contagion process propagating over this graph.
    While estimators with provable convergence rate guarantees have been
    obtained under various formulations of the NIP, a rigorous statistical
    treatment of the problem is still lacking. In this work, we build upon the
    unified NIP formulation of [] to explore the connections between the
    topological properties of the graph to be learnt and the resulting quality
    of the estimators. Specifically, we analyze which properties of the graph
    render the NIP infeasible or hard, and which properties can be exploited to
    improve the quality of the estimators.
\end{abstract}

\section{Introduction}

The central question of this work is to \emph{relate properties of the graph to
the difficulty of inferring the graph from cascades}.

This question breaks down into two subquestions:
\begin{itemize}
    \item which properties of the graph make it impossible or hard to learn
        (identifiability, convergence rates)?
    \item which properties of the graph can be exploited to learn it better or
        faster (e.g.\ through a Bayesian prior)?
\end{itemize}

\subsection*{Related work}
The study of edge prediction in graphs has been an active field of research for
over a decade~\cite{Nowell08, Leskovec07, AdarA05}. \cite{GomezRodriguez:2010}
introduced the {\scshape Netinf} algorithm, which approximates the likelihood of
cascades represented as a continuous-time process. The algorithm was improved in
later work~\cite{gomezbalduzzi:2011}, but is not known to have any theoretical
guarantees beyond empirical validation on synthetic networks.
\cite{Netrapalli:2012} studied the discrete-time version of the independent
cascade model and obtained the first ${\cal O}(s^2 \log m)$ recovery guarantee
on general networks, by using the unregularized MLE and making a
\emph{correlation decay} assumption, which limits the number of new infections
at every step. In this setting, they show a lower bound of order
$\Omega(s \log (m/s))$ on the number of cascades needed for support recovery
with constant probability. They also suggest a {\scshape Greedy} algorithm,
which achieves an ${\cal O}(s \log m)$ guarantee in the case of tree graphs. The
work of~\cite{Abrahao:13} studies the same continuous-time framework as
\cite{GomezRodriguez:2010} and obtains an ${\cal O}(s^9 \log^2 s \log m)$
support recovery algorithm without the \emph{correlation decay} assumption.
\cite{du2013uncover,Daneshmand:2014} propose Lasso-regularized MLE for
recovering the weights of the graph under a continuous-time independent cascade
model. The work of~\cite{du2014influence} is slightly orthogonal to ours, since
they suggest learning the \emph{influence} function rather than the parameters
of the network directly.

More recently, inference of Hawkes processes is perhaps the topic most closely
related to our work, with recent papers~\cite{linderman2014discovering,
linderman2015scalable} focusing on MLE, EM, and variational inference methods
for learning these processes.

\section{Model}

We consider a weighted directed graph $G = (V, \Theta)$, where $k=|V|$ is the
number of nodes and $\Theta\in\R_{+}^{k\times k}$ is the weight matrix.
$\Theta$ implicitly defines the edge set $E$ of the graph.

The model evolves in discrete time, $t\in\N$. Nodes can be in one of three
states: \emph{susceptible}, \emph{infected}, or \emph{immune}. We write $S_t$
for the set of nodes susceptible at the beginning of time step $t\in\N$ and
$I_t$ for the set of nodes infected at time step $t$:
\begin{displaymath}
    S_0 = V,\quad S_{t+1} = S_t \setminus I_t
\end{displaymath}
Nodes which are no longer susceptible are immune.

The dynamics of $I_t$ are described by a Markov process. Let us denote by
$x^t$ the indicator vector of $I_t$ at time step $t\in\N$. $x^0$ is drawn
from the \emph{source distribution} $p_s:\{0,1\}^k\to[0,1]$. For $t\geq 1$, the
process is Markovian:
\begin{equation}
    \label{eq:markov}
    \forall i\in S_t,\quad
    \P\big(x_{i}^{t} = 1\,|\, x^{t-1}\big) = f(\bt_i\cdot x^{t-1})
\end{equation}
where $\bt_i$ is the $i$th column of $\Theta$, $f:\R\to[0,1]$ is a fixed
activation function, and these transitions are independent across nodes $i$.
A cascade continues until no node remains infected.
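
As a concrete illustration of this process, the following minimal Python sketch
simulates a single cascade under the discrete-time model; the activation
function $f$, the weight matrix, and the source indicator are placeholder
inputs, and the choice $f(z) = 1 - e^{-z}$ in the usage example is an
illustrative assumption rather than a modeling choice made in this report.
\begin{verbatim}
import numpy as np

def simulate_cascade(theta, x0, f, rng=None):
    """Simulate one cascade of the discrete-time model.

    theta : (k, k) non-negative weight matrix; theta[:, i] is the column bt_i
    x0    : (k,) 0/1 indicator of the nodes infected at t = 0
    f     : activation function from R_+ to [0, 1], applied elementwise
    Returns the list of infection indicator vectors [x^0, x^1, ...].
    """
    rng = np.random.default_rng() if rng is None else rng
    k = theta.shape[0]
    susceptible = ~x0.astype(bool)          # S_1 = V \ I_0
    xs = [x0.astype(int)]
    x_prev = x0.astype(float)
    while x_prev.any():
        # P(x_i^t = 1 | x^{t-1}) = f(bt_i . x^{t-1}) for i in S_t, 0 otherwise
        p = f(theta.T @ x_prev) * susceptible
        x_new = (rng.random(k) < p).astype(int)
        susceptible &= ~x_new.astype(bool)  # newly infected nodes become immune
        xs.append(x_new)
        x_prev = x_new.astype(float)
    return xs

# Example usage on a toy 3-node graph with f(z) = 1 - exp(-z)
theta = np.array([[0.0, 0.8, 0.3],
                  [0.0, 0.0, 0.5],
                  [0.0, 0.0, 0.0]])
x0 = np.array([1, 0, 0])
cascade = simulate_cascade(theta, x0, f=lambda z: 1.0 - np.exp(-z))
\end{verbatim}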

\section{Identifiability}

A given source distribution $p_s$ together with \cref{eq:markov} completely
specifies the distribution $p$ of a cascade $\mathbf{x} = (x^t)_{t\geq 0}$:
\begin{displaymath}
    p(\bx;\Theta) = p_s(x^0)\prod_{t\geq 1}\prod_{i\in S_t}
    f(\bt_i\cdot x^{t-1})^{x^t_i}\big(1-f(\bt_i\cdot x^{t-1})\big)^{1-x_i^t}
\end{displaymath}

We see this model as parametrized by $\Theta$. $f$ and $p_s$ are considered
fixed.

\begin{definition}
    We say that the model $\mathcal{P}
    = \big\{p(\cdot\,;\,\Theta)\,|\,\Theta\in\R_+^{k\times k}\big\}$ is
    identifiable iff:
    \begin{displaymath}
        \forall \Theta,\Theta'\in\R_+^{k\times k},\quad
        p(\cdot\,;\,\Theta) = p(\cdot\, ;\, \Theta') \Rightarrow \Theta = \Theta'
    \end{displaymath}
\end{definition}

In our case, identifiability is not guaranteed. There are two factors which can
lead to non-identifiability: lack of information and parent ambiguity. We
illustrate both factors in the following two examples.

\begin{figure}
\begin{center}
    \includegraphics[scale=0.8]{example.pdf}
\end{center}
\caption{Example of a graph which can fail to be identifiable, either because
    of lack of information or because of parent ambiguity, if the source
    distribution is chosen adversarially.}
\label{fig:ident}
\end{figure}

\vspace{.5em}

\begin{example}[Lack of information]
    Let us consider the graph in \cref{fig:ident} and the source distribution
    $p_s$ defined by $p_s\big((0, 1, 0)\big) = 1$. In other words, node $2$ is
    always the only infected node at $t=0$. Then all cascades die at $t=1$,
    at which point node 3 becomes infected with probability
    $f(\theta_{2,3})$. This probability does not depend on $\theta_{1,3}$.
    Hence, all matrices $\Theta$ compatible with the graph and sharing the same
    value of $\theta_{2,3}$ yield the same cascade distribution.
\end{example}

\vspace{.5em}

\begin{example}[Parent ambiguity]
Let us consider again the graph in \cref{fig:ident}, this time with a source
distribution such that $p_s\big((1,1,0)\big) = 1$, \emph{i.e.}, nodes 1 and 2
are always jointly infected at time $t=0$. Then all cascades die at time
$t=1$, at which point node 3 becomes infected with probability
$f(\theta_{1,3} + \theta_{2,3})$. This implies that all matrices $\Theta$
compatible with the graph in \cref{fig:ident} (defining the same edge set) and
satisfying $\theta_{1,3} + \theta_{2,3} = c$ for some fixed $c\in \R_+$ define
the same cascade distribution.
\end{example}

\vspace{.5em}

The examples show that for the model to be identifiable, we need conditions on
the source distribution, in particular on how it interacts with the
reachability structure of the graph.

We will abuse the notation $p_s$ and for $w\in V$ write:
\begin{displaymath}
    p_s(w) \eqdef \sum_{\bx: \bx_w = 1} p_s(\bx)
\end{displaymath}
Furthermore we write $u\leadsto v$ if there is a directed path from $u$ to $v$
in $G$ and $u\leadsto v\leadsto w$ if there is a directed path from $u$ to $w$
passing through $v$ in $G$.

\begin{proposition}
Assume that:
\begin{enumerate}
    \item $\forall z\in \R_+,\; f(z)< 1$;
    \item for all $S\subseteq V$ with $|S|\geq 2$, $p_s(S)<1$;
    \item $\forall (u,v)\in V^2$, there exists $w$ such that $p_s(w)\neq 0$ and
        $w\leadsto u\leadsto v$.
\end{enumerate}
Then the model is identifiable.
\end{proposition}

Assumption 2 prevents parent ambiguity at the source. Assumption 1 prevents
parent ambiguity from occurring later in the cascade even when there is no
ambiguity at the source (it could be relaxed to an ``absence of forks''
condition). Assumption 3 prevents lack of information.

\begin{proof}

\end{proof}

\begin{remark} These conditions are the weakest under which identifiability
    holds (we can show that they are necessary and sufficient). Reasonable
    source distributions which imply the conditions include uniformly random
    sources and independent Bernoulli infections.
\end{remark}

\section{Inference}

\subsection{Maximum-Likelihood Estimation}

Because transitions occur independently for each node $i$, the log-likelihood
is a sum of $k$ terms, the $i$th term depending only on $\bt_i$. Hence, each
$\bt_i$ can be learnt separately by solving a node-specific optimization
problem.
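For a single observed cascade, the node-specific problem reads as follows
(written directly from the cascade density above; when several cascades are
observed, the objective is summed over cascades):
\begin{displaymath}
    \hat{\bt}_i \in \argmax_{\bt_i\in\R_+^{k}}\; \sum_{t\geq 1,\, i\in S_t}
    \Big[x_i^t \log f(\bt_i\cdot x^{t-1})
    + \big(1 - x_i^t\big)\log\big(1 - f(\bt_i\cdot x^{t-1})\big)\Big]
\end{displaymath}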

We assume that $f$ and $1-f$ are log-concave, so that this optimization problem
is concave. Depending on the model, $\theta_{i,j}$ is either restricted to a
compact subset of $\R_+$, or we additionally assume that
$\lim_{z\to\infty} f(z) = 1$. We can then apply standard results on the
consistency and asymptotic normality of the maximum-likelihood estimator.
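
As an illustration of this node-by-node estimation, the sketch below fits a
single column $\bt_i$ by constrained maximum likelihood with \texttt{scipy};
the activation $f(z)=1-e^{-z}$ and the data format (cascades given as lists of
indicator vectors, as in the simulation sketch above) are assumptions made for
the example.
\begin{verbatim}
import numpy as np
from scipy.optimize import minimize

def neg_log_likelihood_node(theta_i, i, cascades, f, eps=1e-12):
    """Negative log-likelihood of column theta_i (node i) over a set of cascades."""
    nll = 0.0
    for xs in cascades:                    # xs = [x^0, x^1, ...] indicator vectors
        susceptible = ~xs[0].astype(bool)  # node i is in S_1 iff not initially infected
        for t in range(1, len(xs)):
            if susceptible[i]:
                p = np.clip(f(theta_i @ xs[t - 1]), eps, 1 - eps)
                nll -= xs[t][i] * np.log(p) + (1 - xs[t][i]) * np.log(1 - p)
            susceptible &= ~xs[t].astype(bool)
    return nll

def fit_node(i, cascades, k, f=lambda z: 1.0 - np.exp(-z)):
    """Estimate column bt_i by maximum likelihood under non-negativity constraints."""
    res = minimize(neg_log_likelihood_node, x0=np.full(k, 0.1),
                   args=(i, cascades, f),
                   method='L-BFGS-B', bounds=[(0.0, None)] * k)
    return res.x
\end{verbatim}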

\subsection{Bayesian Approach}

\paragraph{Notation}
We now adopt a Bayesian approach to the previous problem. To recall the
notation introduced above, we let $G=(V, \Theta)$ be a weighted directed graph
with $\Theta \in \R_+^{k \times k}$, and we place a prior $\mathcal{D}$ on
$\Theta$. Let $x_0$ be a source node, sampled from a distribution $p_s$ over
the vertices $V$ of the graph $G$. Since we assume for now that the state of
the source node is entirely observed, the choice of $p_s$ is not relevant to
the inference of $\Theta$. Finally, we let $x_t$ (resp.~$\mathbbm{1}_{S_t}$) be
the binary vector encoding the infected (resp.~susceptible) nodes at time $t$:
\begin{alignat*}{2}
 \Theta & \sim \mathcal{D}\qquad &&\text{(prior on $\Theta$)} \\
 x_0 & \sim p_s &&\text{(random source distribution)}\\
 \forall t\geq 0,\quad x_{t+1} \,|\, x_t, \Theta &\sim
 \mathrm{Ber}\big(f(\Theta^{\top} x_t)\cdot \mathbbm{1}_{S_t}\big)
 &&\text{(cascade process)}
\end{alignat*}

\paragraph{Graph prior}
Prior work, summarized in the related work section, has focused on MLE
regularized with a lasso penalty. This corresponds to MAP estimation under an
independent Laplace prior on $\Theta$:
$$(\Theta_{i,j})\sim \prod_{i,j} \mathrm{Laplace}(0,1)$$
In the context of social network learning, however, we can place more
informative priors. We can:

\begin{itemize}
\item take into account prior knowledge of edges' existence (or lack thereof);
\item take into account common graph structures, such as triangles and clustering.
\end{itemize}

A common prior for graphs is the Exponential Random Graph Model (ERGM), which
allows flexible representations of networks and Bayesian inference. An ERGM
family is defined by a feature vector $s(G)$ and by the probability
distribution:
$$P(G \,|\, \beta) \propto \exp \left( s(G)\cdot \beta \right)$$
where $\beta$ denotes the ERGM parameter vector (distinct from the edge
weights $\Theta$).
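
To make this prior concrete, the sketch below computes a two-dimensional
feature vector $s(G)$ (edge count and triangle count, two commonly used
statistics chosen here purely for illustration) from a binary adjacency matrix,
together with the unnormalized ERGM log-probability; the restriction to an
undirected graph is also an assumption of the example.
\begin{verbatim}
import numpy as np

def ergm_features(adj):
    """s(G) = (number of edges, number of triangles) for an undirected 0/1 adjacency matrix."""
    n_edges = adj.sum() / 2
    n_triangles = np.trace(adj @ adj @ adj) / 6
    return np.array([n_edges, n_triangles])

def ergm_log_prob_unnormalized(adj, beta):
    """log P(G | beta), up to the (intractable) normalizing constant."""
    return ergm_features(adj) @ beta

# Example: a 3-node triangle with illustrative parameter values
adj = np.array([[0, 1, 1], [1, 0, 1], [1, 1, 0]])
beta = np.array([-1.0, 0.5])
print(ergm_log_prob_unnormalized(adj, beta))
\end{verbatim}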

Though straightforward MCMC could be applied here, recent
work~\cite{caimo2011bayesian, koskinen2010analysing, robins2007recent} has shown
that naive MCMC formulations of ERGM inference suffer from slow convergence and
a lack of robustness, and has developed better alternatives. Experiments using
such a prior are ongoing; we present only simple product-distribution priors
here.

\paragraph{Inference}
We can sample from the posterior by MCMC (a minimal sampler sketch is given
after the list below). This might not be the fastest solution, however; we
could greatly benefit from alternative methods:
\begin{itemize}
\item EM\@. This approach was used in \cite{linderman2014discovering,
simma2012modeling} to learn
the parameters of a Hawkes process, a closely related inference problem.
\item Variational Inference. This approach was used
in~\cite{linderman2015scalable} as an extension of the paper cited in the
previous bullet point. Given the scalability of their approach, the similarity
of the two processes, and the computational cost of running MCMC over a large
parameter space, we hope to apply their method to our problem.
\end{itemize}
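
The sketch below is the minimal sampler referred to above: a random-walk
Metropolis scheme over $\Theta$ under independent $\mathrm{Beta}(1,1)$
(i.e.\ uniform) priors on each entry. It is a toy implementation written for
this report, not the PyMC setup used for the experiments below; the activation
$f$ and the cascade format follow the assumptions of the earlier sketches.
\begin{verbatim}
import numpy as np

def cascade_log_likelihood(theta, cascades, f, eps=1e-12):
    """Log-likelihood of Theta (k x k) for a set of cascades under the discrete model."""
    ll = 0.0
    for xs in cascades:
        susceptible = ~xs[0].astype(bool)
        for t in range(1, len(xs)):
            p = np.clip(f(theta.T @ xs[t - 1]), eps, 1 - eps)
            m = susceptible
            ll += np.sum(xs[t][m] * np.log(p[m]) + (1 - xs[t][m]) * np.log(1 - p[m]))
            susceptible &= ~xs[t].astype(bool)
    return ll

def metropolis_posterior(cascades, k, f, n_samples=5000, step=0.05, rng=None):
    """Random-walk Metropolis over Theta with independent uniform priors on [0, 1]."""
    rng = np.random.default_rng() if rng is None else rng
    theta = np.full((k, k), 0.5)
    ll = cascade_log_likelihood(theta, cascades, f)
    samples = []
    for _ in range(n_samples):
        proposal = theta + step * rng.standard_normal((k, k))
        if (proposal < 0).any() or (proposal > 1).any():
            samples.append(theta.copy())          # outside the prior support: reject
            continue
        ll_prop = cascade_log_likelihood(proposal, cascades, f)
        if np.log(rng.random()) < ll_prop - ll:   # uniform prior cancels in the ratio
            theta, ll = proposal, ll_prop
        samples.append(theta.copy())
    return samples
\end{verbatim}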



\begin{figure}
\subfloat[][50 cascades]{
\includegraphics[scale=.4]{../simulation/plots/2015-11-05_22:52:30.pdf}}
\subfloat[][100 cascades]{
\includegraphics[scale=.4]{../simulation/plots/2015-11-05_22:52:47.pdf}}\\
\subfloat[][150 cascades]{
\includegraphics[scale=.4]{../simulation/plots/2015-11-05_22:53:24.pdf}}
\subfloat[][200 cascades]{
\includegraphics[scale=.4]{../simulation/plots/2015-11-05_22:55:39.pdf}}\\
\subfloat[][250 cascades]{
\includegraphics[scale=.4]{../simulation/plots/2015-11-05_22:57:26.pdf}}
\subfloat[][1000 cascades]{
\includegraphics[scale=.4]{../simulation/plots/2015-11-05_22:58:29.pdf}}
\caption{Bayesian inference of $\Theta$ with MCMC using a $\mathrm{Beta}(1, 1)$
prior. For each figure, the plot $(i, j)$ in the $i$th row and $j$th column
represents a histogram of samples from the posterior of the corresponding edge
weight $\Theta_{i, j}$. The red line indicates the true value of the edge
weight. If an edge does not exist (has weight $0$), the red line coincides with
the $y$ axis.}
\label{betapriorbayeslearning}
\end{figure}

\paragraph{Experiments}

We ran experiments on a simple network with 4 nodes and $\binom{4}{2}=6$
parameters to learn, using the MCMC package PyMC\@. We plot in
Figure~\ref{betapriorbayeslearning} the progressive learning of $\Theta$ for
increasing numbers of observations. Of note, since the independent cascade
model does not include self-loops, the diagonal terms are never properly
learned; this is expected and not a problem. We notice that the existence or
absence of an edge is learned relatively quickly, with the posterior on
zero-weight edges concentrating at $0$ after $100$ cascades. Obtaining a
posterior concentrated around the true non-zero edge weights, however, requires
$1000$ cascades, which is unreasonably high considering the small number of
parameters learned in this experiment.

\subsection{Active Learning}

In this setup, the source $x_0$ is no longer drawn from the distribution $p_s$
but is chosen by the designer of the experiment. Once a source is chosen, a
cascade is drawn from the Markov process; we denote its distribution by
$\mathcal{D}'$. We wish to choose the source which maximizes the information
gain on the parameter $\Theta$, so as to speed up learning. We suggest
choosing, at each round, the source which maximizes the mutual information
between $\Theta$ and the cascade $(x_t)_{t\geq 1}$. This mutual information can
be estimated for each candidate source by sampling:
\begin{equation*}
I\big((x_t)_{t\geq 1}, \Theta \,|\, x_0 = i\big) =
\mathbb{E}_{\substack{\Theta \sim \mathcal{D}_n \\
x \,|\, \Theta, i \sim \mathcal{D}'}}\log p(x\,|\,\Theta)
- \mathbb{E}_{\substack{\Theta \sim \mathcal{D}_n
\\ x' \,|\, \Theta, i \sim \mathcal{D}'}} \log p(x')
\end{equation*}
where $\mathcal{D}_n$ denotes the current posterior over $\Theta$ (after $n$
cascades have been observed). The second term involves marginalizing over
$\Theta$: $p(x') = \mathbb{E}_{\Theta \sim \mathcal{D}_n} p(x'\,|\, \Theta)$.
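
A naive Monte Carlo version of this estimate is sketched below for a single
candidate source $i$: it reuses the hypothetical \texttt{simulate\_cascade} and
\texttt{cascade\_log\_likelihood} helpers from the earlier sketches, draws
cascades under posterior samples of $\Theta$, and approximates the marginal
$p(x')$ by averaging the likelihood over the same posterior samples (with a
log-sum-exp for numerical stability). The sample sizes are arbitrary
illustrative choices.
\begin{verbatim}
import numpy as np

def information_gain(i, posterior_samples, k, f, n_inner=20, rng=None):
    """Monte Carlo estimate of I((x_t)_{t>=1}, Theta | x_0 = i)."""
    rng = np.random.default_rng() if rng is None else rng
    x0 = np.zeros(k, dtype=int)
    x0[i] = 1
    total, count = 0.0, 0
    for theta in posterior_samples:
        for _ in range(n_inner):
            xs = simulate_cascade(theta, x0, f, rng)               # x | Theta, x_0 = i
            log_p_given_theta = cascade_log_likelihood(theta, [xs], f)
            # log p(x) = log of the posterior-averaged likelihood (stable log-sum-exp)
            logliks = np.array([cascade_log_likelihood(th, [xs], f)
                                for th in posterior_samples])
            m = logliks.max()
            log_p_marginal = m + np.log(np.mean(np.exp(logliks - m)))
            total += log_p_given_theta - log_p_marginal
            count += 1
    return total / count

# The source maximizing the estimated information gain:
# best_source = max(range(k), key=lambda i: information_gain(i, samples, k, f))
\end{verbatim}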

\begin{algorithm}
\caption{Active Bayesian Learning through Cascades (ABC)}
\begin{algorithmic}[1]
\State $\Theta \sim \mathcal{D}_0 = \mathcal{D}$ \Comment{Initial prior on
$\Theta$}
\State $n \gets 0$
\While{True}
\State $i \leftarrow \argmax_{i} I\big((x_t)_{t \geq 1}, \Theta \,|\, x_0 = i\big)$
\Comment{Maximize mutual information}
\State Observe $(x_t)_{t\geq1} \,|\, x_0 = i$ \Comment{Observe a cascade from source $i$}
\State $\mathcal{D}_{n+1} \gets \text{posterior of $\Theta \sim \mathcal{D}_n$
given $(x_t)_{t\geq1}$}$;\quad $n \gets n + 1$ \Comment{Update posterior on $\Theta$}
\EndWhile
\end{algorithmic}
\end{algorithm}
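
To tie the pieces together, the following toy driver runs the loop above using
the hypothetical helpers from the previous sketches. For simplicity it
re-estimates the posterior from scratch at every round by re-running the
Metropolis sampler on all cascades observed so far (which yields the same
posterior as a sequential update, only less efficiently), and it thins the
chain aggressively to keep the mutual-information step cheap.
\begin{verbatim}
import numpy as np

def active_learning_loop(k, f, true_theta, n_rounds=10, rng=None):
    """Toy driver for the ABC loop; true_theta plays the role of nature's draw of Theta."""
    rng = np.random.default_rng() if rng is None else rng
    cascades = []
    # Initial "posterior": draws from the uniform Beta(1, 1) prior
    samples = [rng.random((k, k)) for _ in range(50)]
    for _ in range(n_rounds):
        # Select the source maximizing the estimated information gain
        source = max(range(k), key=lambda i: information_gain(i, samples, k, f))
        x0 = np.zeros(k, dtype=int)
        x0[source] = 1
        cascades.append(simulate_cascade(true_theta, x0, f, rng))  # observe a cascade
        samples = metropolis_posterior(cascades, k, f)[::100]      # update and thin the posterior
    return samples, cascades
\end{verbatim}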

\bibliography{sparse}{}
\bibliographystyle{alpha}

\end{document}