\documentclass[10pt]{article}
\usepackage[utf8]{inputenc}
\usepackage{amsmath, amssymb, amsthm, microtype}
\usepackage[pagebackref=false,breaklinks=true,colorlinks=true,citecolor=blue]{hyperref}
\usepackage[capitalize, noabbrev]{cleveref}
\usepackage{graphicx}
\usepackage{bbm}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator{\E}{\mathbb{E}}
\let\P\relax
\DeclareMathOperator{\P}{\mathbb{P}}
\newcommand{\ex}[1]{\E\left[#1\right]}
\newcommand{\prob}[1]{\P\left[#1\right]}
\newcommand{\inprod}[1]{\left\langle #1 \right\rangle}
\newcommand{\R}{\mathbf{R}}
\newcommand{\N}{\mathbf{N}}
\newcommand{\C}{\mathcal{C}}
\newcommand{\eps}{\varepsilon}
\newcommand{\bt}{\boldsymbol{\theta}}
\newcommand{\bx}{\mathbf{x}}
\newcommand{\cl}[1]{\text{\textbf{#1}}}
\newcommand{\eqdef}{\mathbin{\stackrel{\rm def}{=}}}
\newtheorem{theorem}{Theorem}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\theoremstyle{remark}
\newtheorem*{example}{Example}
\newtheorem*{remark}{Remark}
\title{Regression Analysis with Network Data}
\author{Thibaut Horel \and Jean Pouget-Abadie}
\begin{document}
\maketitle
\begin{abstract}
\end{abstract}
\section{Introduction}
Central question: \emph{relating properties of the graph to the difficulty of
inferring the graph from cascades.}
Two subquestions:
\begin{itemize}
\item which properties of the graph make it impossible or hard to learn
(identifiability, convergence rate)?
\item which properties of the graph can be exploited to learn better or
faster (e.g., via a Bayesian prior)?
\end{itemize}
\section{Model}
Weighted directed graph $G = (V, \Theta)$, where $k=|V|$ is the number of nodes
and $\Theta\in\R_{+}^{k\times k}$. $\Theta$ implicitly defines the edge set $E$
of the graph: an edge is present whenever the corresponding entry of $\Theta$
is positive.
Discrete time model, $t\in\N$. Nodes can be in one of three states:
\emph{susceptible}, \emph{infected} or \emph{immune}. $S_t$: nodes susceptible
at the beginning of time step $t\in\N$. $I_t$: nodes infected at time step $t$.
\begin{displaymath}
S_0 = V,\quad S_{t+1} = S_t \setminus I_t
\end{displaymath}
A node infected at time step $t$ is no longer susceptible at time step $t+1$;
nodes which are neither susceptible nor infected are immune.
The dynamics of $I_t$ are described by a Markov process. Let us denote
by $x^t$ the indicator vector of $I_t$ at time step $t\in\N$. $x^0$ is drawn
from the \emph{source distribution} $p_s:\{0,1\}^k\to[0,1]$. For $t\geq 1$, the
transitions are given by:
\begin{equation}
\label{eq:markov}
\forall i\in S_t,\quad
\P\big(x_{i}^{t} = 1\,|\, x^{t-1}\big) = f(\bt_i\cdot x^{t-1})
\end{equation}
where $\bt_i$ is the $i$th column of $\Theta$ and $f:\R_+\to[0,1]$;
conditionally on $x^{t-1}$, infections occur independently across the nodes
$i\in S_t$. A cascade continues until there are no more infected nodes.
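As an illustration (this particular choice of $f$ is not assumed in what
follows), taking $f(z) = 1 - e^{-z}$ recovers an independent-cascade-style
model: writing the transition probability as a product over infected neighbors,
\begin{displaymath}
\P\big(x_i^t = 1\,|\,x^{t-1}\big)
= 1 - e^{-\bt_i\cdot x^{t-1}}
= 1 - \prod_{j\,:\,x_j^{t-1} = 1} e^{-\theta_{j,i}},
\end{displaymath}
so that each node $j$ infected at time $t-1$ independently fails to infect $i$
with probability $e^{-\theta_{j,i}}$.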
\section{Identifiability}
A given source distribution $p_s$ together with \cref{eq:markov} completely
specifies the distribution $p$ of a cascade $\bx = (x^t)_{t\geq 0}$:
\begin{displaymath}
p(\bx;\Theta) = p_s(x^0)\prod_{t\geq 1}\prod_{i\in S_t}
f(\bt_i\cdot x^{t-1})^{x^t_i}\big(1-f(\bt_i\cdot x^{t-1})\big)^{1-x_i^t}
\end{displaymath}
We see this model as parametrized by $\Theta$. $f$ and $p_s$ are considered
fixed.
\begin{definition}
We say that the model $\mathcal{P}
= \big\{p(\cdot\,;\,\Theta)\,|\,\Theta\in\R_+^{k\times k}\big\}$ is
identifiable iff:
\begin{displaymath}
\forall \Theta,\Theta'\in\R_+^{k\times k},\quad
p(\cdot\,;\,\Theta) = p(\cdot\, ;\, \Theta') \Rightarrow \Theta = \Theta'
\end{displaymath}
\end{definition}
In our case, identifiability is not guaranteed. There are two factors which can
lead to unidentifiability: lack of information and parent ambiguity. We
illustrate both factors in the following two examples.
\begin{figure}
\begin{center}
\includegraphics[scale=0.8]{example.pdf}
\end{center}
\caption{Example of a graph for which the model can be unidentifiable, either
because of lack of information or because of parent ambiguity, when the source
distribution is chosen adversarially.}
\label{fig:ident}
\end{figure}
\vspace{.5em}
\begin{example}[Lack of information]
Let us consider the graph in \cref{fig:ident} and the source distribution
$p_s$ defined by $p_s\big((0, 1, 0)\big) = 1$. In other words, node $2$ is
always the only infected node at $t=0$. Then it is clear that all
cascades die at $t=1$, at which point node 3 becomes infected with probability
$f(\theta_{2,3})$. This probability does not depend on $\theta_{1,3}$.
Hence all matrices $\Theta$ compatible with the graph and for which
$\theta_{2,3}$ is fixed yield the same cascade distribution.
\end{example}
\vspace{.5em}
\begin{example}[Parent ambiguity]
Let us consider again the graph in \cref{fig:ident}, now with a source
distribution such that $p_s\big((1,1,0)\big) = 1$, \emph{i.e.}, nodes 1 and 2
are always jointly infected at time $t=0$. Then all cascades die at time
$t=1$, at which point node 3 becomes infected with probability
$f(\theta_{1,3} + \theta_{2,3})$. This implies that all matrices $\Theta$
compatible with the graph in \cref{fig:ident} (defining the same edge set) and
for which $\theta_{1,3} + \theta_{2,3} = c$ for some fixed $c\in \R_+$ define
the same cascade probability distribution.
\end{example}
\vspace{.5em}
These examples show that for the model to be identifiable, we need conditions
on the source distribution, in particular on how it interacts with the
reachability structure of the graph.
Abusing notation, for $u\in V$ we write:
\begin{displaymath}
p_s(u) \eqdef \sum_{\bx:\, x_u = 1} p_s(\bx)
\end{displaymath}
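For instance, under the source distribution of the first example above,
$p_s(2) = 1$ while $p_s(1) = p_s(3) = 0$.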
Furthermore we write $u\leadsto v$ if there is a directed path from $u$ to $v$
in $G$ and $u\leadsto v\leadsto w$ if there is a directed path from $u$ to $w$
passing through $v$ in $G$.
\begin{proposition}
Under the following assumptions:
\begin{enumerate}
\item $\forall z\in \R_+,\; f(z)< 1$;
\item for all $S\subseteq V$ with $|S|\geq 2$, $p_s(S)<1$, where $p_s(S)$
denotes the probability that the source set is exactly $S$;
\item $\forall (u,v)\in V^2$, there exists $w$ such that $p_s(w)\neq 0$ and
$w\leadsto u\leadsto v$;
\end{enumerate}
the model is identifiable.
\end{proposition}
Assumption 2 prevents parent ambiguity at the source. Assumption 1 prevents
parent ambiguity from occurring later in the cascade even when there is no
ambiguity at the source (it could be relaxed to an ``absence of forks''
condition). Assumption 3 prevents lack of information.
\begin{proof}
\end{proof}
\begin{remark} These conditions are the weakest under which identifiability
holds (we can show that they are both necessary and sufficient). Reasonable
source distributions which imply the conditions include the uniform
distribution and independent Bernoulli sources.
\end{remark}
\section{Inference}
\subsection{Maximum-Likelihood Estimation}
Because transitions occur independently across nodes, the log-likelihood
decomposes as a sum of $k$ terms, the $i$th term depending only on $\bt_i$.
Hence, each $\bt_i$ can be learned separately by solving a node-specific
optimization problem, sketched below.
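As a sketch, using the cascade likelihood above and written for a single
observed cascade (with several i.i.d.\ cascades, one sums the corresponding
terms over cascades), the node-specific problem for node $i$ is:
\begin{displaymath}
\hat{\bt}_i \in \argmax_{\bt_i\in\R_+^{k}}\;
\sum_{t\geq 1\,:\, i\in S_t}
\Big[x_i^t \log f(\bt_i\cdot x^{t-1})
+ (1 - x_i^t)\log\big(1 - f(\bt_i\cdot x^{t-1})\big)\Big].
\end{displaymath}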
We assume that $f$ and $1-f$ are log-concave, so that each node-specific
problem is concave. Depending on the model, $\theta_{i,j}$ can be restricted
to a compact subset of $\R_+$; otherwise, we need to assume that
$\lim_{z\to\infty} f(z) = 1$. Then we can apply standard results on the
consistency and asymptotic normality of the MLE.
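As an illustration of the kind of statement we have in mind (a sketch only,
under the usual regularity conditions and assuming the true parameter lies in
the interior of the parameter set): if $\hat{\bt}_i^{(n)}$ denotes the
estimator computed from $n$ i.i.d.\ cascades drawn from
$p(\cdot\,;\,\Theta^*)$, then $\hat{\bt}_i^{(n)}\to\bt_i^*$ in probability and
\begin{displaymath}
\sqrt{n}\,\big(\hat{\bt}_i^{(n)} - \bt_i^*\big)
\xrightarrow{d} \mathcal{N}\big(0,\, I(\bt_i^*)^{-1}\big),
\end{displaymath}
where $I(\bt_i^*)$ is the Fisher information matrix of a single cascade with
respect to $\bt_i$.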
\subsection{Bayesian Approach}
\end{document}