\documentclass[final]{beamer}
\usepackage[utf8]{inputenc}
\usepackage[scale=1.7]{beamerposter} % Use the beamerposter package for laying out the poster
\usetheme{confposter} % Use the confposter theme supplied with this template
\usepackage{framed, amsmath, amsthm, amssymb}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{color, bbm}
\setbeamercolor{block title}{fg=dblue,bg=white} % Colors of the block titles
\setbeamercolor{block body}{fg=black,bg=white} % Colors of the body of blocks
\setbeamercolor{block alerted title}{fg=white,bg=dblue!70} % Colors of the alerted block titles
\setbeamercolor{block alerted body}{fg=black,bg=dblue!10} % Colors of the body of alerted blocks
\newlength{\sepwid}
\newlength{\onecolwid}
\newlength{\twocolwid}
\newlength{\threecolwid}
\setlength{\paperwidth}{48in} % A0 width: 46.8in
\setlength{\paperheight}{36in} % A0 height: 33.1in
\setlength{\sepwid}{0.024\paperwidth} % Separation width (white space) between columns
\setlength{\onecolwid}{0.29\paperwidth} % Width of one column
\setlength{\twocolwid}{0.464\paperwidth} % Width of two columns
\setlength{\threecolwid}{0.708\paperwidth} % Width of three columns
\setlength{\topmargin}{-1in} % Reduce the top margin size
\title{Bayesian and Active Learning for Graph Inference} % Poster title
\author{Thibaut Horel\\[0.5em] Jean Pouget-Abadie} % Author(s)
\begin{document}
\setlength{\belowcaptionskip}{2ex} % White space under figures
\setlength\belowdisplayshortskip{2ex} % White space under equations
\begin{frame}[t]
\begin{columns}[t]
\begin{column}{\sepwid}\end{column}
\begin{column}{\onecolwid} % The first column
\begin{block}{Problem}
\emph{How can an unknown network be recovered from observations of contagion
cascades?}
\vspace{1em}
\begin{itemize}
\item \textbf{Observe:} state (infected or not) of nodes over time.
\item \textbf{Objective:} learn $\Theta$, matrix of edge weights.
\end{itemize}
\end{block}
\vspace{2cm}
\begin{block}{Contagion Model~\cite{Pouget:2015}}
\begin{itemize}
\item $X^t\in\{0,1\}^N$: state of the network at time $t$
\item At $t=0$, $X^0$ drawn from \emph{source distribution}
\item For $t=0,1,\dots$:
\begin{itemize}
\item $X^{t+1}$ only depends on $X^t$
\item for each node $j$, the new state is drawn independently according to:
\begin{displaymath}
\mathbb{P}(X^{t+1}_j = 1 | X^t) = f(\Theta_j \cdot X^t)
\end{displaymath}
($f$: link function of the cascade model)
\end{itemize}
\end{itemize}
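\vspace{0.5em}
For instance, in the Independent Cascade (IC) model with edge infection
probabilities $p_{ij}$, setting $\Theta_{ij} = -\log(1-p_{ij})$ recovers the IC
dynamics with link function $f(z) = 1 - e^{-z}$, since then
\begin{displaymath}
f(\Theta_j \cdot X^t) = 1 - \prod_{i\,:\,X^t_i = 1} (1 - p_{ij}).
\end{displaymath}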
\vspace{1em}
\begin{figure}
\begin{center}
\includegraphics[scale=2.5]{drawing.pdf}
\end{center}
\end{figure}
\end{block}
\vspace{2cm}
\begin{block}{MLE}
\begin{itemize}
\item Fix a node $j$ and define $y^t = x^{t+1}_j$
\item Observations $\{(x^t, y^t)\}$ are drawn from a GLM. The MLE problem is:
\begin{equation*}
\begin{split}
\hat{\Theta}_j\in \arg\max_{\Theta_j} \sum_{t}~& y^t\log f(\Theta_j\cdot x^t)
\\ & + (1-y^t) \log \big(1 - f(\Theta_j\cdot x^t)\big)
\end{split}
\end{equation*}
The problem decouples across nodes and can be solved efficiently by SGD on
$\Theta_j$ (gradient sketched below).
\vspace{1cm}
\item the log-likelihood is concave for common contagion models (\emph{e.g.},
the IC model) $\Rightarrow$ provable convergence guarantees~\cite{Netrapalli:2012}.
\end{itemize}
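\vspace{0.5em}
As a sketch, the stochastic gradient for a single observation $(x^t, y^t)$ is
\begin{displaymath}
\nabla_{\Theta_j} \ell_t =
\frac{y^t - f(\Theta_j\cdot x^t)}{f(\Theta_j\cdot x^t)\big(1 -
f(\Theta_j\cdot x^t)\big)}\, f'(\Theta_j\cdot x^t)\, x^t,
\end{displaymath}
which for the logistic link $f(z) = (1+e^{-z})^{-1}$ reduces to
$(y^t - f(\Theta_j\cdot x^t))\,x^t$.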
\end{block}
\end{column} % End of the first column
%-----------------------------------------------------------------------------
\begin{column}{\sepwid}\end{column} % Empty spacer column
%-----------------------------------------------------------------------------
\begin{column}{\onecolwid} % The second column
\begin{block}{Bayesian Framework}
\begin{figure}
\centering
\vspace{-1em}
\includegraphics[scale=3]{graphical.pdf}
\end{figure}
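Concretely (a sketch under the contagion model above), with prior $p(\Theta)$
and observed states $x^0,\dots,x^T$, the posterior factorizes along the Markov
chain:
\begin{displaymath}
p(\Theta \,|\, x^0,\dots,x^T) \propto p(\Theta) \prod_{t=0}^{T-1}
\prod_{j=1}^{N} f(\Theta_j\cdot x^t)^{x^{t+1}_j}
\big(1 - f(\Theta_j\cdot x^t)\big)^{1-x^{t+1}_j}
\end{displaymath}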
{\bf Advantages:}
\begin{itemize}
\item prior encodes domain-specific knowledge of graph structure (e.g. ERGMs)
\item posterior expresses uncertainty over edge weights
\end{itemize}
{\bf Disadvantages:}
\begin{itemize}
\item hard to scale to large numbers of observations.
\end{itemize}
\end{block}
\begin{block}{Active Learning}
\emph{Can we learn faster by choosing the source node? If so, how should we
choose it?}
\begin{center}--~OR~--\end{center}
\emph{Can we cherry-pick the most relevant part of the dataset?}
\vspace{0.5em}
{\bf Idea:} Focus on unexplored (high-uncertainty) parts of the graph,
i.e.~maximize the information gain per observation.
\vspace{0.5em}
\textbf{Baseline heuristic:}
\begin{itemize}
\item Choose the source w.p.~proportional to its estimated out-degree
$\implies$ wider cascades $\implies$ more data (one concrete choice is
sketched below)
\end{itemize}
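\vspace{0.5em}
One concrete choice (an illustrative assumption; any monotone proxy for the
out-degree works) is to threshold the current estimate $\hat{\Theta}$:
\begin{displaymath}
p(i) \propto \sum_j \mathbbm{1}\big\{\hat{\Theta}_{ij} > \epsilon\big\}
\end{displaymath}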
\vspace{0.5em}
\textbf{Principled heuristic:}
\begin{itemize}
\item Choose source w.p.~proportional to mutual information
\begin{multline*}
p(i) \propto I\big(\{X^t\}_{t\geq 1}, \Theta \,|\, X^0 = \{i\}\big)\\
= H(\Theta) - H\big(\Theta \,|\, \{X^t\}_{t\geq 1}, X^0 = \{i\}\big)
\end{multline*}
\item Requires knowing the true distribution of $\{X^t\}$ $\Rightarrow$ use
the current estimate of $\Theta$ instead (a Monte Carlo sketch follows this
list).
\end{itemize}
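\vspace{0.5em}
A minimal sketch, assuming $K$ cascades $x^{(1)},\dots,x^{(K)}$ are simulated
from the current estimate $\hat{\Theta}$ with source $i$:
\begin{displaymath}
p(i) \propto H(\Theta) - \frac{1}{K}\sum_{k=1}^{K} H\big(\Theta \,|\,
\{X^t\}_{t\geq 1} = x^{(k)}, X^0 = \{i\}\big)
\end{displaymath}
with conditional entropies computed under the current posterior.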
\end{block}
\end{column}
%-----------------------------------------------------------------------------
\begin{column}{\sepwid}\end{column}
%-----------------------------------------------------------------------------
\begin{column}{\onecolwid} % The third column
\begin{block}{Implementation}
{\bf Scalable Bayesian Inference}
\begin{itemize}
\item MCMC: implements the graphical model directly with PyMC~\cite{pymc};
does not scale beyond 10 nodes.
\item VI: implements variational inference with Blocks~\cite{blocks}
(work in progress: use B\"ohning bounds to avoid sampling).
\end{itemize}
\vspace{1em}
{\bf Scalable Active Learning}\\
Mutual information over full cascades is expensive to compute. Instead:
\begin{itemize}
\item use the mutual information between $\Theta$ and $X^1$ only.
\textbf{Justification} (data-processing inequality): for any function $f$,
\begin{displaymath}
I(X, Y) \geq I\big(f(X), Y\big)
\end{displaymath}
in our case, $f$ truncates the cascade at $t=1$
(work in progress: obtain a closed-form formula in this case).
\item the variance is a lower bound on the mutual information $\implies$ pick
node $i$ w.p.~proportional to $\sum_j \text{Var}(\Theta_{ij})$ (see the
sketch below).
\end{itemize}
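\vspace{0.5em}
One justification (a sketch, assuming a Gaussian approximation of the
posterior): the entropy of each weight is monotone in its variance,
\begin{displaymath}
H(\Theta_{ij}) = \tfrac{1}{2}\log\big(2\pi e\,\text{Var}(\Theta_{ij})\big),
\end{displaymath}
so sampling by total posterior variance targets high-uncertainty regions of
the graph.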
\end{block}
\begin{block}{Results}
\begin{center}
\includegraphics[scale=1.5]{../../simulation/plots/finale.png}
\end{center}
\end{block}
\begin{block}{References}
{\scriptsize \bibliography{sparse}
\bibliographystyle{abbrv}}
\end{block}
%-----------------------------------------------------------------------------
\end{column} % End of the third column
\end{columns} % End of all the columns in the poster
\end{frame} % End of the enclosing frame
\end{document}