From 4401de9b38b4169efa71ad47c81774e74a2fe3da Mon Sep 17 00:00:00 2001 From: Thibaut Horel Date: Thu, 18 Jun 2015 09:58:12 -0700 Subject: Some notes --- theory/main.tex | 132 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 theory/main.tex (limited to 'theory/main.tex') diff --git a/theory/main.tex b/theory/main.tex new file mode 100644 index 0000000..d293b6f --- /dev/null +++ b/theory/main.tex @@ -0,0 +1,132 @@ +\documentclass[10pt]{article} +\usepackage[utf8x]{inputenc} +\usepackage{amsmath,amsfonts,amsthm} +\usepackage[english]{babel} +\usepackage[capitalize, noabbrev]{cleveref} +\usepackage{algorithm} +\usepackage{algpseudocode} + +\newenvironment{enumerate*}% + {\vspace{-2ex} \begin{enumerate} % + \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}% + {\end{enumerate}} + +\newenvironment{itemize*}% + {\vspace{-2ex} \begin{itemize} % + \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}% + {\end{itemize}} + +\newenvironment{description*}% + {\vspace{-2ex} \begin{description} % + \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}% + {\end{description}} + +\DeclareMathOperator{\E}{\mathbb{E}} +\let\P\relax +\DeclareMathOperator{\P}{\mathbb{P}} +\newcommand{\ex}[1]{\E\left[#1\right]} +\newcommand{\prob}[1]{\P\left[#1\right]} +\newcommand{\inprod}[1]{\left\langle #1 \right\rangle} +\newcommand{\R}{\mathbf{R}} +\newcommand{\N}{\mathbf{N}} +\newcommand{\C}{\mathcal{C}} +\newcommand{\eps}{\varepsilon} +\newcommand{\eqdef}{\mathbin{\stackrel{\rm def}{=}}} +\newcommand{\llbracket}{[\![} + +\newtheorem{theorem}{Theorem} +\newtheorem{lemma}{Lemma} +\algrenewcommand\algorithmicensure{\textbf{Output:}} +\algrenewcommand\algorithmicrequire{\textbf{Input:}} + +\author{} +\title{} +\date{} + +\begin{document} + +\section{Probabilistic Model} +During the last decade, a growing body of work has focused on the graph +inference problem: how can an unknown graph be recovered from the observation +of cascades propagating along it? (CITE). While our probabilistic model for +cascades of violence strongly relies on this line of work, we depart from it in +the following ways: +\begin{itemize} + \item in our case, the graph along which the cascades of violence propagate + (the co-offending network) is known, and we are only interested in + recovering (and ultimately predicting) the edges of the cascade, that + is, identifying who influenced whom. + \item the previous models treat multiple cascades as independent + observations which occur sequentially. In our case, cascades can die + and spawn at all time and temporally overlap. This need to be carefully + accounted for in our generative model and the resulting inference + algorithms. +\end{itemize} + +\subsection{Pairwise Propagation} + +For a pair $(u,v)$ of nodes, we model the propagation of the cascade +from $u$ to $v$ as the sequence of the following two probabilistic events: +\begin{enumerate} + + \item \emph{structural propagation:} a Bernouilli variable dictates whether + or not a propagation occurs. The parameter of this Bernouilli variable + is computed from the edge weights of the co-offending network as + follows: + \begin{equation}\label{eq:structural} + p_{u,v}^s \eqdef \prod_{e\in P(u,v)} + \frac{1}{1+e^{-\frac{w_e}{\lambda}}} + \end{equation} + where $P(u,v)$ denotes the shortest path from $u$ to $v$ and $\lambda$ + is the \emph{structural parameter}. That is, the weight $w_e$ of each + edge $e$ on the path from $u$ to $v$ is mapped to a probability + $p_e\in [0,1]$ through the logistic function: + \begin{displaymath} + p_e \eqdef \frac{1}{1+e^{-\frac{w_e}{\lambda}}} + \end{displaymath} + Note that the probability of structural propagation both \emph{(a)} + decreases as the distance between $u$ and $v$ in the co-offending + network increases, and \emph{(b)} increases as the weights of the edges + on the path from $u$ to $v$ increase. The structural parameter + $\lambda$ controls how quickly the probability $p_e$ of a given edge + converges to one as the weight $w_e$ increases. + + \item \emph{temporal propagation:} if the structural propagation occurs, + then the propagation time $\Delta_{u,v}$ between $u$ and $v$ is drawn + from a continuous distribution with support in $\R_+$. We denote by + $f(t)$ the density function of this probability distribution and by + $F(t)$ its cumulative function. Table TODO gives the expression of $f$ + and $F$ for the different temporal distributions considered in this + work. + + In our data, the infection times are only observed with a temporal + resolution of one day. As a consequence, for two nodes $u$ and $v$ + whose \empb{observed} infection times are respectively $t_u$ and $t_v$ + (days), the \emph{real} propagation time could be anything in the + interval $(t_v-t_u-1, t_v-t_u]$. Hence, the probability of observing + a propagation time $\Delta_{u,v} = t_v - t_u$ is given by: + \begin{equation}\label{eq:temporal} + p_{u,v}^t \eqdef \int_{\Delta_{u,v}-1}^{\Delta_{u,v}} f(t)dt + \end{equation} + +\paragraph{Successful propagation.} For two nodes $u$ and $v$ with observed +infection times $t_u$ and $t_v$, the probability $p_{u,v}$ that $u$ infected +$v$ is obtained by multiplying the structural and temporal probabilities given +by \eqref{eq:structural} and \eqref{eq:temporal}: +\begin{displaymath} + p_{u,v} \eqdef p_{u,v}^s\cdot p_{u,v}^t +\end{displaymath} + +\subsection{Spawning of New Cascades} + +\subsection{Cascade Model} + +\section{Maximum Likelihood Inference} + +\subsection{Finding the Cascades} + +\subsection{Finding the Spawning Parameter} + +\subsection{Finding the Pairwise Propagation Parameters} + +\end{document} -- cgit v1.2.3-70-g09d2 From b84661e2ee1b1931fa48068dbf723056202be860 Mon Sep 17 00:00:00 2001 From: Thibaut Horel Date: Fri, 19 Jun 2015 21:40:26 -0700 Subject: First draft of the (almost) full specification of the probabilistic model --- theory/main.tex | 147 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 143 insertions(+), 4 deletions(-) (limited to 'theory/main.tex') diff --git a/theory/main.tex b/theory/main.tex index d293b6f..5d66bbd 100644 --- a/theory/main.tex +++ b/theory/main.tex @@ -64,6 +64,7 @@ the following ways: \end{itemize} \subsection{Pairwise Propagation} +\label{sec:edge} For a pair $(u,v)$ of nodes, we model the propagation of the cascade from $u$ to $v$ as the sequence of the following two probabilistic events: @@ -101,26 +102,164 @@ from $u$ to $v$ as the sequence of the following two probabilistic events: In our data, the infection times are only observed with a temporal resolution of one day. As a consequence, for two nodes $u$ and $v$ - whose \empb{observed} infection times are respectively $t_u$ and $t_v$ + whose \emph{observed} infection times are respectively $t_u$ and $t_v$ (days), the \emph{real} propagation time could be anything in the interval $(t_v-t_u-1, t_v-t_u]$. Hence, the probability of observing a propagation time $\Delta_{u,v} = t_v - t_u$ is given by: \begin{equation}\label{eq:temporal} - p_{u,v}^t \eqdef \int_{\Delta_{u,v}-1}^{\Delta_{u,v}} f(t)dt + p_{u,v}^t(\Delta_{u,v}) \eqdef \int_{\Delta_{u,v}-1}^{\Delta_{u,v}} f(t)dt \end{equation} +\end{enumerate} \paragraph{Successful propagation.} For two nodes $u$ and $v$ with observed infection times $t_u$ and $t_v$, the probability $p_{u,v}$ that $u$ infected $v$ is obtained by multiplying the structural and temporal probabilities given by \eqref{eq:structural} and \eqref{eq:temporal}: \begin{displaymath} - p_{u,v} \eqdef p_{u,v}^s\cdot p_{u,v}^t + p_{u,v}(t_u, t_v)  \eqdef p_{u,v}^s\cdot p_{u,v}^t(\Delta_{u,v}) \end{displaymath} -\subsection{Spawning of New Cascades} +\subsection{Failed Propagations and Spawning of New Cascades} +\label{sec:roots} + +\paragraph{Failed propagation.} +When an edge fails to propagate the violence cascade, it could be either +because the structural propagation did not occur, or because the structural +propagation occurred, but the temporal propagation didn't occur in time. + +For an edge $(u,v)$ with infection times $t_u$ and $t_v$, it follows directly +from Section~\ref{sec:edge} that structural propagation does not occur with +probability $1-p_{u,v}^s$. + +If structural propagation occurs, but temporal propagation does not occur in +time, it means that node $v$ was infected at time $t_v$ (through other means) +\emph{before} $u$ succeeds in infecting $v$. That is, the propagation time was +larger than $\Delta_{u,v}$. This occurs with probability: +\begin{displaymath} + \int_{\Delta_{u,v}}^{+\infty} f(t)dt = 1-F(\Delta_{u,v}) +\end{displaymath} + +Overall, the probability of a failed propagation over an edge $(u,v)$ is given +by: +\begin{displaymath} + \tilde{p}_{u,v}(t_u, t_v) + \eqdef 1 - p_{u,v}^s + p_{u,v}^s\left(1-F(\Delta_{u,v})\right) + = 1 - p_{u,v}^s F(\Delta_{u,v}) +\end{displaymath} + +\paragraph{}For an edge $(u,v)$ where $u$ is a victim node and $v$ is a not +a victim, the situation is similar: either the structural propagation failed, +or it succeeded but the temporal propagation did not occur in time. In the +latter case, since we did not observe an infection at all for $v$, this means +that the propagation time between $u$ and $v$ was such that the infection time +of $v$ would have fallen after the end of the study period. Denoting by $T$ the +observation horizon (the end of the study period), the probability of a failed +propagation for an edge $(u,v)$ where $v$ is not a victim is: +\begin{displaymath} + \tilde{p}_{u,v}(t_u, T) \eqdef 1-p_{u,v}^s F(T-t_u) +\end{displaymath} + +\paragraph{Spawning of New Cascades.} + +Our probabilistic model considers another cause of infection for victim nodes: +beyond pairwise propagation of the violence cascade, nodes can also +spontaneously become infected and spawn a new violence cascade. We simply model +this by a Bernouilli variable of parameter $\beta$: for any node, the +probability that it becomes infected spontaneously is $\beta$. + +\subsection{The Directed Acyclic Vector of Infection} +\label{sec:dag} + +It follows from Sections~\ref{sec:edge} and \ref{sec:roots} that \emph{(1)} no +infection can occur between two non victim nodes, \emph{(2)} no infection can +occur along an edge $(u,v)$ where $t_u > t_v$. + +This suggests to orient and trim the co-offending network $G$ to construct the +following directed graph $D$ by applying the following rules: +\begin{enumerate} + \item remove edges between non-victim nodes. + \item orient edges from victim nodes to non-victim nodes. + \item for any edge $(u,v)$ between two victims, orient it in the direction + from smallest infection time to largest infection time. +\end{enumerate} + +The resulting graph $D$ only contains edges along with the propagation of +violence can possibly occur. Furthermore the edges are oriented in the +direction of propagation. + +It has been observed in (CITE) that $D$ is acyclic. For completeness we +reproduce their proof here and adapt it to account for non-victim nodes (which +do not exist in the original paper). + +\begin{theorem} +The graph $D$ obtained from $G$ by applying rules 1. to 3. is acyclic. +\end{theorem} + +\begin{proof} + Assume by contradiction that $D$ contains a cycle $(u_1,\ldots, u_n)$ where + $(u_1, u_2),\dots,(u_{n-1}, u_n), (u_n, u_1)$ are edges of $D$. Because of + rule 1., all nodes $u_1,\dots, u_n$ are victim nodes. + + Applying rule 3. to all edges in the cycle implies that + $t_1