summaryrefslogtreecommitdiffstats
path: root/ic_theory/main.tex
blob: eb59c6ee3be3a50df685e730e9fcd57a84b7ad67 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
\documentclass[10pt]{article}
\usepackage[utf8x]{inputenc}
\usepackage{amsmath,amsfonts,amsthm}
\usepackage[english]{babel}
\usepackage[capitalize, noabbrev]{cleveref}
\usepackage{algorithm}
\usepackage{algpseudocode}

\newenvironment{enumerate*}%
  {\vspace{-2ex} \begin{enumerate} %
     \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
  {\end{enumerate}}
 
\newenvironment{itemize*}%
  {\vspace{-2ex} \begin{itemize} %
     \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
  {\end{itemize}}
 
\newenvironment{description*}%
  {\vspace{-2ex} \begin{description} %
     \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
  {\end{description}}

\DeclareMathOperator{\E}{\mathbb{E}}
\let\P\relax
\DeclareMathOperator{\P}{\mathbb{P}}
\newcommand{\ex}[1]{\E\left[#1\right]}
\newcommand{\prob}[1]{\P\left[#1\right]}
\newcommand{\inprod}[1]{\left\langle #1 \right\rangle}
\newcommand{\R}{\mathbf{R}}
\newcommand{\N}{\mathbf{N}}
\newcommand{\C}{\mathcal{C}}
\newcommand{\eps}{\varepsilon}
\newcommand{\eqdef}{\mathbin{\stackrel{\rm def}{=}}}
\newcommand{\llbracket}{[\![}

\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\algrenewcommand\algorithmicensure{\textbf{Output:}}
\algrenewcommand\algorithmicrequire{\textbf{Input:}}
\DeclareMathOperator*{\argmax}{arg\,max}

\author{}
\title{}
\date{}

\begin{document}

\section{Probabilistic Model}
During the last decade, a growing body of work has focused on the graph
inference problem: how can an unknown graph be recovered from the observation
of cascades propagating along it? (CITE). While our probabilistic model for
cascades of violence strongly relies on this line of work, we depart from it in
the following ways:
\begin{itemize}
    \item in our case, the graph along which the cascades of violence propagate
        (the co-offending network) is known, and we are only interested in
        recovering (and ultimately predicting) the edges of the cascade, that
        is, identifying who influenced whom.
    \item the previous models treat multiple cascades as independent
        observations which occur sequentially. In our case, cascades can die
        and spawn at all time and temporally overlap. This need to be carefully
        accounted for in our generative model and the resulting inference
        algorithms.
\end{itemize}

\subsection{Pairwise Propagation}
\label{sec:edge}

For a pair $(u,v)$ of nodes, we model the propagation of the cascade
from $u$ to $v$ as the sequence of the following two probabilistic events:
\begin{enumerate}

    \item \emph{structural propagation:} a Bernouilli variable dictates whether
        or not a propagation occurs. The parameter of this Bernouilli variable
        is computed from the edge weights of the co-offending network as
        follows:
        \begin{equation}\label{eq:structural}
            p_{u,v}^s \eqdef \prod_{e\in P(u,v)}
            \frac{1}{1+e^{-\frac{w_e}{\lambda}}}
        \end{equation}
        where $P(u,v)$ denotes the shortest path from $u$ to $v$ and $\lambda$
        is the \emph{structural parameter}. That is, the weight $w_e$ of each
        edge $e$ on the  path from $u$ to $v$ is mapped to a probability
        $p_e\in [0,1]$ through the logistic function:
        \begin{displaymath}
            p_e \eqdef \frac{1}{1+e^{-\frac{w_e}{\lambda}}}
        \end{displaymath}
        Note that the probability of structural propagation both \emph{(a)}
        decreases as the distance between $u$ and $v$ in the co-offending
        network increases, and \emph{(b)} increases as the weights of the edges
        on the path from $u$ to $v$ increase. The structural parameter
        $\lambda$ controls how quickly the probability $p_e$ of a given edge
        converges to one as the weight $w_e$ increases.

    \item \emph{temporal propagation:} if the structural propagation occurs,
        then the propagation time $\Delta_{u,v}$ between $u$ and $v$ is drawn
        from a continuous distribution with support in $\R_+$. We denote by
        $f(t)$ the density function of this probability distribution and by
        $F(t)$ its cumulative function. Table TODO gives the expression of $f$
        and $F$ for the different temporal distributions considered in this
        work.

        In our data, the infection times are only observed with a temporal
        resolution of one day. As a consequence, for two nodes $u$ and $v$
        whose \emph{observed} infection times are respectively $t_u$ and $t_v$
        (days), the \emph{real} propagation time could be anything in the
    interval $(t_v-t_u-1, t_v-t_u]$. Hence, the probability of observing
    a propagation time $\Delta_{u,v} = t_v - t_u$ is given by:
    \begin{equation}\label{eq:temporal}
        p_{u,v}^t(\Delta_{u,v}) \eqdef \int_{\Delta_{u,v}-1}^{\Delta_{u,v}} f(t)dt
    \end{equation}
\end{enumerate}

\paragraph{Successful propagation.} For two nodes $u$ and $v$ with observed
infection times $t_u$ and $t_v$, the probability $p_{u,v}$ that $u$ infected
$v$ is obtained by multiplying the structural and temporal probabilities given
by \eqref{eq:structural} and \eqref{eq:temporal}:
\begin{displaymath}
    p_{u,v}(t_u, t_v)  \eqdef p_{u,v}^s\cdot p_{u,v}^t(\Delta_{u,v})
\end{displaymath}

\subsection{Failed Propagations and Spawning of New Cascades}
\label{sec:roots}

\paragraph{Failed propagation.}
When an edge fails to propagate the violence cascade, it could be either
because the structural propagation did not occur, or because the structural
propagation occurred, but the temporal propagation didn't occur in time.

For an edge $(u,v)$ with infection times $t_u$ and $t_v$, it follows directly
from Section~\ref{sec:edge} that structural propagation does not occur with
probability $1-p_{u,v}^s$.

If structural propagation occurs, but temporal propagation does not occur in
time, it means that node $v$ was infected at time $t_v$ (through other means)
\emph{before} $u$ succeeds in infecting $v$. That is, the propagation time was
larger than $\Delta_{u,v}$. This occurs with probability:
\begin{displaymath}
    \int_{\Delta_{u,v}}^{+\infty} f(t)dt = 1-F(\Delta_{u,v})
\end{displaymath}

Overall, the probability of a failed propagation over an edge $(u,v)$ is given
by:
\begin{displaymath}
    \tilde{p}_{u,v}(t_u, t_v)
    \eqdef 1 - p_{u,v}^s + p_{u,v}^s\left(1-F(\Delta_{u,v})\right)
    = 1 - p_{u,v}^s F(\Delta_{u,v})
\end{displaymath}

\paragraph{}For an edge $(u,v)$ where $u$ is a victim node and $v$ is a not
a victim, the situation is similar: either the structural propagation failed,
or it succeeded but the temporal propagation did not occur in time. In the
latter case, since we did not observe an infection at all for $v$, this means
that the propagation time between $u$ and $v$ was such that the infection time
of $v$ would have fallen after the end of the study period. Denoting by $T$ the
observation horizon (the end of the study period), the probability of a failed
propagation for an edge $(u,v)$ where $v$ is not a victim is:
\begin{displaymath}
    \tilde{p}_{u,v}(t_u, T) \eqdef 1-p_{u,v}^s F(T-t_u)
\end{displaymath}

\paragraph{Spawning of New Cascades.}

Our probabilistic model considers another cause of infection for victim nodes:
beyond pairwise propagation of the violence cascade, nodes can also
spontaneously become infected and spawn a new violence cascade. We simply model
this by a Bernouilli variable of parameter $\beta$: for any node, the
probability that it becomes infected spontaneously is $\beta$.

\subsection{The Directed Acyclic Vector of Infection}
\label{sec:dag}

Given a vector of infection times denoted by $\textbf{t}$, it follows from
Sections~\ref{sec:edge} and \ref{sec:roots} that \emph{(1)} no infection can
occur between two non victim nodes, \emph{(2)} no infection can occur along an
edge $(u,v)$ where $t_u > t_v$.

This suggests to orient and trim the co-offending network $G$ to construct the
following directed graph $D_{\mathbf{t}}$ by applying the following rules:
\begin{enumerate}
    \item remove edges between non-victim nodes.
    \item orient edges from victim nodes to non-victim nodes.
    \item for any edge $(u,v)$ between two victims, orient it in the direction
        from smallest infection time to largest infection time.
\end{enumerate}

The resulting graph $D_{\mathbf{t}}$ only contains edges along with the
propagation of violence can possibly occur. Furthermore the edges are oriented
in the direction of propagation.

It has been observed in (CITE) that $D$ is acyclic. For completeness we
reproduce their proof here and adapt it to account for non-victim nodes (which
do not exist in the original paper).

\begin{theorem}
    The graph $D_{\mathbf{t}}$ obtained from $G$ by applying rules 1. to 3. is
    acyclic.
\end{theorem}

\begin{proof}
    Assume by contradiction that $D$ contains a cycle $(u_1,\ldots, u_n)$ where
    $(u_1, u_2),\dots,(u_{n-1}, u_n), (u_n, u_1)$ are edges of $D$. Because of
    rule 1., all nodes $u_1,\dots, u_n$ are victim nodes.

    Applying rule 3. to all edges in the cycle implies that
    $t_1<t_2<\dots<t_n<t_1$. The two extremal terms of this chain of
    inequalities implie $t_1<t_1$ which is a contradiction.
\end{proof}

\subsection{Cascade Model}

Using Sections~\ref{sec:edge} to \ref{sec:dag}, we can now fully specify our
cascade model.

The violence propagation proceeds as follows: every time a node becomes
infected, it attempts to infect all its neighbors which are not infected yet
according to the pairwise model of Section~\ref{sec:edge}. In other words, at
a given time, a node is subjected to the ongoing infection attempts of all its
neighbors which have been infected in the past. Because of the temporal
component of the pairwise propagation probabilities, these attempts will
succeed at different times. The infection is attributed to the first neighbor
who succeeds.Furthermore, all the infection attempts are independent from each
other.  

\paragraph{Example.} Let us consider a node $v$ with $n$ neighbors $u_1,\dots,
u_n$. For simplicity, let us assume that the structural propagation
probabilities are the same and equal to one for all edges $(u_i, v)$. We also
assume that all neighbors of $v$ were infected at time $0$ and we use the
exponential temporal model with parameter $\alpha$. Then, the probability that
$v$ is infected at time larger than $t$ is given by:
\begin{displaymath}
    p_v(t) \eqdef \P\left[\min_{1\leq i\leq n} Z_i \geq t\right]
    = e^{-n\alpha t}
\end{displaymath}
where $Z_i$ is an exponential variable of parameter $\alpha$ and where we used
that the law of the minimum of exponential variables is also exponential with
parameter equal to the sum of the individual variables' parameters.

This example shows that the larger the neighborhood the more likely it is that
a node will be infected quickly. The parameter $\alpha n$ of the resulting
exponential variable captures the aggregated infection potential of the
neighborhood.

\paragraph{} Since the infection of a node $v$ is always attributed to the
first neighbor which succeeds in infecting it, a victim node
can have either one parent (when it is infected by it) or no parent (when it
spawns a cascade spontaneously). Because the graph $D$ along which the cascades
of violence occur is acyclic, it follows that the violence propagates along $D$
by forming trees.

Formally, if we denote by $(V, E)$ the vertices and edges of $D$, and we write
$V = R\cup T$ where $R$ are the victim nodes and $T$ the non-victim nodes
($R\cap T=\emptyset$), then \emph{the violence cascades form a random forest
spanning $R$ in $D$}.

\paragraph{Likelihood.} Given pairwise parameters $(\alpha, \lambda, \beta)$
and a random forest $\mathcal{T} = (T_1,\ldots, T_r)$, we can compute the
probability of observing the infection times given by the vector $\textbf{t}$:
\begin{equation}
    \label{eq:model}
    \P[\mathbf{t}, \mathcal{T}\,|\, \beta, \alpha, \lambda]
    = \beta^r(1-\beta)^{n-r}\prod_{e\in E_{\mathcal{T}}} p_e
    \prod_{e\in E_{D_{\mathcal{\mathbf{t}}}}\setminus E_{\mathcal{T}}}
    \tilde{p}_e
\end{equation}
where $E_{\mathcal{T}}$ denotes the edge set of the forest $\mathcal{T}$ and
$E_{D_{\mathbf{t}}}$ denotes the edge set of the DAG $D_{\mathbf{t}}$ defined
in Section~\ref{sec:dag}.

\section{Maximum Likelihood Inference}

The probabilistic model \eqref{eq:model} defines the joint probability of
a pair $\mathbf{t}, \mathcal{T}$ of infection times and cascades. However, we
are interested in situations where only the infection times are observed.

A direct application of maximum likelihood estimation would recover the
parameters $\beta, \alpha$ and $\lambda$ by solving the following optimization
program:
\begin{displaymath}
    \max_{\alpha,\beta,\lambda}\sum_{\mathcal{T}}
    \P[\mathbf{t}, \mathcal{T}\,|\, \beta, \alpha, \lambda]
\end{displaymath}
However, since we are also interested in recovering the infection edges, we
solve the following optimization program instead:
\begin{equation}
    \label{eq:program}
    \mathcal{T}^*, \beta^*, \alpha^*, \lambda^* \in
    \argmax_{\mathcal{T},\beta,\alpha,\lambda}
    \P[\mathbf{t}, \mathcal{T}\,|\, \beta, \alpha, \lambda]
\end{equation}
When solving this optimization program, the optimal forest $\mathcal{T}^*$ will
indicate the most likely edges along which the infection could have propagated,
while $\beta$, $\alpha$ and $\lambda$ will give us information about the
temporal and structural scale of pairwise infection.

\subsection{Finding the Cascades}

We first assume that $\beta$, $\alpha$ and $\lambda$ are fixed and solve
\eqref{eq:program} for $\mathcal{T}$:
\begin{equation}\label{eq:program2}
    \mathcal{T}^*\in\argmax_{\mathcal{T}}
    \beta^r(1-\beta)^{n-r}\prod_{e\in E_{\mathcal{T}}} p_e
    \prod_{e\in E_{D_{\mathcal{\mathbf{t}}}}\setminus E_{\mathcal{T}}}
    \tilde{p}_e
\end{equation}
Note that the solution $\mathcal{T}^*$ depends on the value of $(\beta, \alpha,
\lambda)$. We will then optimize over these parameters in the following
sections.

By multiplying and dividing each term by $\tilde{p}_e$ in the first product, we
can rewrite the objective function:
\begin{displaymath}
    \mathcal{T}^*\in\argmax_{\mathcal{T}}
    \left(\frac{\beta}{1-\beta}\right)^r(1-\beta)^n
    \prod_{e\in E_{\mathcal{T}}} \frac{p_e}{\tilde{p}_e}
    \prod_{e\in E_{D_{\mathcal{\mathbf{t}}}}} \tilde{p}_e
\end{displaymath}
Note that now, the last product, as well as the $(1-\beta)^n$ do not depend on
the choice of $\mathcal{T}$ and can be dropped without changing the
optimization program. Then, by taking the $\log$, we obtain the following
equivalent optimization program:
\begin{displaymath}
    \mathcal{T}^*\in\argmax_{\mathcal{T}}\; r\log\frac{\beta}{1-\beta}
    +\sum_{e\in E_{\mathcal{T}}} \log\frac{p_e}{\tilde{p}_e}
\end{displaymath}

As mentioned in Section XXX, $\mathcal{T}$ is a random forest spawning $R$, the
set of victim nodes, in $D_{\mathbf{t}}$. So choosing $\mathcal{T}$ amounts to
choosing for each node in $R$:
\begin{itemize}
    \item either to make it the root of a new tree, in which case it will
        contribute a term $\log\frac{\beta}{1-\beta}$ to the objective
        function.
    \item either to make it part of an existing tree, by including one of its
        incoming edge $e$ in the edge set $E_{\mathcal{T}}$. In this case, the
        node contributes a term $\log\frac{p_e}{\tilde{p}_e}$ to the objective
        function.
\end{itemize}

Since our goal is to solve \eqref{eq:program2}, in the second case, it is
sufficient to consider the incoming edge which maximizes the quantity
$\frac{p_e}{\tilde{p}_e}$. For this edge $e$, deciding between the two cases
then amounts to comparing $\frac{p_e}{\tilde{p}_e}$ to $\frac{\beta}{1-\beta}$.
This leads to a simple thresholding algorithm for solving \eqref{eq:program2}. 

\begin{algorithm}
    \caption{Thresholding algorithm for cascade recovery}
    \begin{algorithmic}[1]
    \Require graph $G$, infection times $\mathbf{t}$, $\alpha$, $\beta$, $\lambda$
    \Ensure cascades $\mathcal{T}$
    \State $D_{\mathbf{t}}\gets$ acyclic graph obtained by trimming $G$
    \State $R\gets\{u\in G: t_u < \infty\}$
    \State $E\gets\emptyset$
    \For{$v\in R$}
    \State $u\gets\argmax\left\{\frac{p_{u,v}}{\tilde{p}_{u,v}}: (u,v)\in
    D_t\right\}$
    \If{$\frac{p_{u,v}}{\tilde{p}_{u,v}}\geq \frac{\beta}{1-\beta}$}
    \State $E\gets E\cup \{(u,v)\}$
    \EndIf
    \EndFor
    \State \textbf{return} $\mathcal{T} = (R, E)$
    \end{algorithmic}
\end{algorithm}

\subsection{Finding the Spawning Parameter}

In this section, we consider the value of $\alpha$ and $\lambda$ fixed and try
to optimize \eqref{eq:program} jointly over $\mathcal{T}$ and $\beta$.

For a node $v\in R$, let us define:
\begin{displaymath}
    q_v \eqdef \max\left\{\frac{p_{u,v}}{\tilde{p}_{u,v}}:(u,v)\in E_{\mathbf{t}}\right\}
\end{displaymath}
and denote by $E_{\beta}$ the set:
\begin{displaymath}
    R_{\beta} \eqdef \left\{v\in R:q_v\geq \frac{\beta}{1-\beta}\right\}
\end{displaymath}
Then it follows from the analysis of Section XXX, that for a fixed value of
$\beta$, the optimal value of \eqref{eq:program} when optimizing over
$\mathcal{T}$ is given by:
\begin{displaymath}
    f(\beta) = (|R|-|R_\beta|)\log\frac{\beta}{1-\beta}
    + n\log(1-\beta)+ \sum_{v\in R_\beta} \log q_v
\end{displaymath}
Hence our goal is to find the maximum of $f(\beta)$ over the interval $(0,1)$.
Note that the maximum exists since $f$ diverges to $-\infty$ as $\beta$
approaches $0$ or $1$. Furthermore $f$ is continuous over $(0,1)$ since we can
rewrite it:
\begin{displaymath}
    f(\beta) = n\log(1-\beta) + \sum_{v\in R}
    \log\max\left\{\frac{\beta}{1-\beta}, q_v\right\}
\end{displaymath}

\subsection{Finding the Pairwise Propagation Parameters}

\end{document}