We consider a graph ${\cal G}= (V, E, \Theta)$, where $\Theta$ is a $|V|\times |V|$ matrix of parameters describing the edge weights of $\mathcal{G}$. Let $m\defeq |V|$. For each node $j$, let $\Theta_{j}$ be the $j^{th}$ column vector of $\Theta$. Intuitively, $\Theta_{i,j}$ captures the ``influence'' of node $i$ on node $j$. A \emph{Cascade model} is a Markov process over a finite state space $\{0, 1, \dots, K-1\}^V$ with the following properties:
\begin{enumerate}
\item Conditioned on the previous time step, the transition probabilities for each node $i \in V$ are mutually independent.
\item Of the $K$ possible states, there exists a \emph{contagious state} such that all transition probabilities of the Markov process are either constant or can be expressed as a function of the graph parameters $\Theta$ and the set of ``contagious nodes'' at the previous time step.
\item The initial distribution over $\{0, 1, \dots, K-1\}^V$ of the cascade model is such that every node can eventually reach the \emph{contagious state} with non-zero probability. The ``contagious'' nodes at $t=0$ are called \emph{source nodes}.
\end{enumerate}
In other words, a cascade model describes a diffusion process whereby a set of contagious nodes ``influence'' other nodes in the graph to adopt (or not) their behavior. An \emph{influence cascade} is a realization of this random process: the successive states of the nodes in graph ${\cal G}$. Note that both the ``single source'' assumption made in \cite{Daneshmand:2014} and \cite{Abrahao:13} and the ``uniformly chosen source set'' assumption made in \cite{Netrapalli:2012} satisfy condition 3.
In the context of Graph Inference, \cite{Netrapalli:2012} focuses specifically on the well-known discrete-time independent cascade model recalled below, which \cite{Abrahao:13} and \cite{Daneshmand:2014} generalize to continuous time. Whilst we restrict ourselves to discrete-time processes, we consider both the independent cascade model and the linear voter model. Despite their obvious differences, both models make the graph inference problem very similar to the standard generalized linear model estimation problem. In fact, we can define a class of diffusion processes for which this is true: the \emph{Generalized Linear Cascade Models}. The linear threshold model is a special case and is discussed in Section~\ref{sec:linear_threshold}.
\subsection{Generalized Linear Cascade Models}
\label{sec:GLC}
Let \emph{susceptible} characterize any state which can become contagious at the next time step with a non-zero probability. We draw inspiration from generalized linear models to introduce Generalized Linear Cascades:
\begin{definition}
\label{def:glcm}
Let $X^t$ be the indicator variable of the ``contagious'' nodes at time step $t$. A \emph{generalized linear cascade model} is a cascade model such that for each susceptible node $j$ in state $s$ at time step $t$, the probability of $j$ becoming ``contagious'' at time step $t+1$, conditionally on $X^t$, is a Bernoulli random variable with parameter $f_s(\Theta_j \cdot X^t)$:
$$\mathbb{P}(X^{t+1}_j = 1 \,|\, X^t \text{ and } e^t_j = s) = f_s(\Theta_j \cdot X^t)$$
where $e^t_j$ denotes the state of node $j$ at time step $t$ and $f_s : \mathbb{R} \rightarrow [0,1]$ is the inverse link function associated with state $s$.
\end{definition}
In other words, each generalized linear cascade provides, for each node $j \in V$, a series of measurements $(X^t, X^{t+1}_j)_{t \in {\cal T}_j}$ issued from a generalized linear model, where ${\cal T}_j$ denotes the time steps at which node $j$ is susceptible. Note also that $\E[X^{t+1}_i\,|\,X^t] = f(\inprod{\theta_i}{X^t})$. As such, $f$ can be interpreted as the inverse link function of our generalized linear cascade model.
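For concreteness, the following sketch (illustrative only; it assumes NumPy, a dense parameter matrix, and a single inverse link function $f$ shared by all susceptible states) simulates one transition of a generalized linear cascade:
\begin{verbatim}
import numpy as np

def glc_step(Theta, x, f, rng):
    """One transition of a generalized linear cascade.

    Theta : (m, m) array, Theta[i, j] is the influence of node i on node j
    x     : (m,) 0/1 array indicating the contagious nodes at time t
    f     : inverse link function mapping <Theta_j, x> to [0, 1]
    """
    p = f(Theta.T @ x)  # p[j] = f(<Theta_j, x>)
    return (rng.random(len(x)) < p).astype(int)
\end{verbatim}
In practice, the bookkeeping of which nodes are susceptible (and of the state-dependent functions $f_s$) must be layered on top of this step, since the sketch draws every coordinate independently from the same $f$.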
\subsection{Examples}
\subsubsection{Independent Cascade Model}
In the independent cascade model, nodes can be either susceptible, contagious, or
immune. At $t=0$, all source nodes are ``contagious'' and all remaining nodes are
``susceptible''. At each time step $t$, for each edge $(i,j)$ where $j$ is
susceptible and $i$ is contagious, $i$ attempts to infect $j$ with probability
$p_{i,j}\in[0,1]$; the infection attempts are independent of each other. If $i$
succeeds, $j$ becomes contagious at time step $t+1$. Regardless of $i$'s
success, node $i$ becomes immune at time $t+1$; in other words, nodes stay
contagious for only one time step. The cascade process terminates when no contagious nodes remain.
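A compact simulation sketch of this process (illustrative only; it assumes NumPy, a matrix \texttt{P} of the probabilities $p_{i,j}$, and an indicator vector of source nodes) is:
\begin{verbatim}
import numpy as np

def simulate_ic(P, sources, rng):
    """Simulate one independent cascade; returns the list of X^t indicators."""
    m = P.shape[0]
    contagious = sources.astype(bool)          # X^0
    immune = np.zeros(m, dtype=bool)
    history = [contagious.astype(int)]
    while contagious.any():
        # probability that node j escapes every infection attempt
        escape = np.prod(np.where(contagious[:, None], 1.0 - P, 1.0), axis=0)
        susceptible = ~contagious & ~immune
        newly = susceptible & (rng.random(m) > escape)
        immune |= contagious                   # contagious nodes become immune
        contagious = newly
        history.append(contagious.astype(int))
    return history
\end{verbatim}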
If we denote by $X^t$ the indicator variable of the set of contagious nodes at time
step $t$, then if $j$ is susceptible at time step $t+1$, we have:
\begin{displaymath}
\P\big[X^{t+1}_j = 1\,|\, X^{t}\big]
= 1 - \prod_{i = 1}^m (1 - p_{i,j})^{X^t_i}.
\end{displaymath}
Defining $\Theta_{i,j} \defeq \log(1-p_{i,j})$, this can be rewritten as:
\begin{equation}\label{eq:ic}
\tag{IC}
\P\big[X^{t+1}_j = 1\,|\, X^{t}\big]
= 1 - \prod_{i = 1}^m e^{\Theta_{i,j}X^t_i}
= 1 - e^{\inprod{\theta_j}{X^t}}
\end{equation}
Therefore, the independent cascade model is a Generalized Linear Cascade model
with canonical link function $f : z \mapsto 1 - e^z$.
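As a quick sanity check of this reparametrization, the following snippet (illustrative only, assuming NumPy) verifies numerically that $1 - \prod_i (1-p_{i,j})^{X^t_i} = 1 - e^{\inprod{\theta_j}{X^t}}$ when $\Theta_{i,j} = \log(1-p_{i,j})$:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
p = rng.uniform(0.0, 0.9, size=5)       # infection probabilities p_{i,j} into node j
x = rng.integers(0, 2, size=5)          # contagious indicators X^t_i
theta = np.log(1.0 - p)                 # Theta_{i,j} = log(1 - p_{i,j})

direct = 1.0 - np.prod((1.0 - p) ** x)  # 1 - prod_i (1 - p_{i,j})^{x_i}
glc = 1.0 - np.exp(theta @ x)           # 1 - exp(<theta_j, x>)
assert np.isclose(direct, glc)
\end{verbatim}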
\subsubsection{The Linear Voter Model}
In the Linear Voter Model, nodes can be either \emph{red} or \emph{blue}. Both states are symmetric, but without loss of generality, we can assume that the \emph{blue} nodes are contagious and the \emph{red} nodes are susceptible. The parameters of the graph are normalized such that $\forall j,\ \sum_i \Theta_{i,j} = 1$. At each round, every node $j$ independently chooses one of its neighbors $i$ with probability $\Theta_{i,j}$ and adopts its color. The cascade stops at a fixed horizon time $T$ or when all nodes have the same color.
If we denote by $X^t$ the indicator variable of the set of blue nodes at time
step $t$, then we have:
\begin{equation}
\mathbb{P}\left[X^{t+1}_j = 1 | X^t \right] = \sum_{i=1}^m \Theta_{i,j} X_i^t = \inprod{\Theta_j}{X^t}
\tag{V}
\end{equation}
Therefore, the linear voter model is a Generalized Linear Cascade model
with canonical link function $f: z \mapsto z$.
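A minimal simulation sketch of one voter-model round (illustrative only; it assumes NumPy and a column-normalized parameter matrix as above) is:
\begin{verbatim}
import numpy as np

def voter_step(Theta, x, rng):
    """One round of the linear voter model.

    Theta : (m, m) array whose column j sums to 1;
            Theta[i, j] is the probability that node j copies node i.
    x     : (m,) 0/1 array indicating the blue nodes at time t.
    """
    m = len(x)
    # each node j copies the color of a neighbor drawn from column j
    picks = [rng.choice(m, p=Theta[:, j]) for j in range(m)]
    return x[picks]
\end{verbatim}
Under this sampling scheme, $\E[X^{t+1}_j \,|\, X^t] = \sum_i \Theta_{i,j} X^t_i$, which is exactly equation~(V).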
\subsection{Maximum Likelihood Estimation}
\label{sec:mle}
Recovering the model parameter $\Theta$ from observed influence cascades is the
central question of the present work. Recovering the edges in $E$ from observed
influence cascades is a well-identified problem known as the \emph{Graph
Inference} problem. However, recovering the influence parameters is no less
important and has been seemingly overlooked so far. In this work we focus on
recovering $\Theta$, noting that the set of edges $E$ can then be recovered
through the following equivalence:
\begin{displaymath}
(i,j)\in E\Leftrightarrow \Theta_{i,j} \neq 0
\end{displaymath}
Given observations $(x^1,\ldots,x^n)$ of a cascade model, we can recover
$\Theta$ via Maximum Likelihood Estimation (MLE). Denoting by $\mathcal{L}$ the
log-likelihood function, we consider the following $\ell_1$-regularized MLE
problem:
\begin{displaymath}
\hat{\Theta} \in \argmax_{\Theta} \frac{1}{n} \mathcal{L}(\Theta\,|\,x^1,\ldots,x^n)
- \lambda\|\Theta\|_1
\end{displaymath}
where $\lambda$ is the regularization factor, which helps prevent
overfitting and controls the sparsity of the solution.
The generalized linear cascade model is decomposable in the following sense:
given Definition~\ref{def:glcm}, the log-likelihood can be written as the sum
of $m$ terms, each term $i\in\{1,\ldots,m\}$ only depending on $\theta_i$.
Since this is equally true for $\|\Theta\|_1$, each column $\theta_i$ of
$\Theta$ can be estimated by a separate optimization program:
\begin{equation}\label{eq:pre-mle}
\hat{\theta}_i \in \argmax_{\theta_i} \mathcal{L}_i(\theta_i\,|\,x^1,\ldots,x^n)
- \lambda\|\theta_i\|_1
\end{equation}
where we denote by ${\cal T}_i$ the time steps at which node $i$ is susceptible and:
\begin{multline*}
\mathcal{L}_i(\theta_i\,|\,x^1,\ldots,x^n) = \frac{1}{|{\cal T}_i|} \sum_{t\in {\cal T}_i } x_i^{t+1}\log f(\inprod{\theta_i}{x^{t}}) \\
+ (1 - x_i^{t+1})\log\big(1-f(\inprod{\theta_i}{x^t})\big)
\end{multline*}
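As an illustration of how program~\eqref{eq:pre-mle} can be solved in practice, the following proximal-gradient (ISTA-style) sketch handles the independent cascade link $f(z) = 1 - e^z$; it is a simplified, illustrative implementation (assuming NumPy; the step size, iteration count, and clipping constant are arbitrary choices), not the estimator analyzed in this paper:
\begin{verbatim}
import numpy as np

def soft_threshold(v, t):
    return np.sign(v) * np.maximum(np.abs(v) - t, 0.0)

def fit_node_ic(X, y, lam, step=0.1, iters=2000):
    """l1-regularized MLE for one node under f(z) = 1 - exp(z).

    X   : (n, m) array; row t is the contagious indicator x^t observed
          while the node was susceptible.
    y   : (n,) array; y[t] = 1 if the node became contagious at t+1.
    lam : regularization factor lambda.
    """
    n, m = X.shape
    theta = np.full(m, -0.5)                      # entries of theta_j are <= 0
    for _ in range(iters):
        z = X @ theta                             # <theta_j, x^t>
        denom = np.maximum(1.0 - np.exp(z), 1e-12)
        # gradient of the averaged negative log-likelihood
        grad = X.T @ (y * np.exp(z) / denom - (1.0 - y)) / n
        theta = soft_threshold(theta - step * grad, step * lam)
        theta = np.minimum(theta, 0.0)            # keep log(1 - p) <= 0
    return theta
\end{verbatim}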
A sufficient condition for program~\eqref{eq:pre-mle} to be a convex program is
that $\mathcal{L}_i$ is a concave function. This will be the case if, for
example, $f$ and $(1-f)$ are both log-concave functions. Remember that
a twice-differentiable function $f$ is log concave iff. $f''f \leq f'^2$. It is
easy to verify this property for $f$ and $(1-f)$ in the Independent Cascade
Model and Voter Model.
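For instance, for the independent cascade link $f(z) = 1 - e^z$ (defined on $z \leq 0$), we have $f'(z) = f''(z) = -e^z$, so that
\begin{displaymath}
f''(z)f(z) = -e^z(1 - e^z) \leq 0 \leq e^{2z} = f'(z)^2,
\end{displaymath}
hence $f$ is log-concave; moreover, $1 - f(z) = e^z$ is log-linear and therefore log-concave as well. For the voter model, $f(z) = z$ and $1 - f(z) = 1 - z$ are both concave, hence log-concave on their domains.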
Note that in the case of the voter model, the cascade runs until the fixed horizon time $T$, so that each node can contribute up to $T$ measurements per cascade. In the case of the independent cascade model, node $i$ contributes measurements only for the time steps during which it is susceptible, i.e., until it becomes contagious or the cascade terminates.