author     Thibaut Horel <thibaut.horel@gmail.com>  2015-11-05 12:14:06 -0500
committer  Thibaut Horel <thibaut.horel@gmail.com>  2015-11-05 12:14:06 -0500
commit     e4752124605bfce12b953c06c13425472aacb811 (patch)
tree       3faea4634beeb9020896fed3fb4de2411ddf159e /finale
parent     15e4871fc224e9e74c93b772b15aea7031f262ab (diff)
download   cascades-e4752124605bfce12b953c06c13425472aacb811.tar.gz
Notes for the mid project report, WIP
Diffstat (limited to 'finale')
-rw-r--r--  finale/mid_report.tex  205
1 file changed, 205 insertions, 0 deletions
diff --git a/finale/mid_report.tex b/finale/mid_report.tex
new file mode 100644
index 0000000..42a28c3
--- /dev/null
+++ b/finale/mid_report.tex
@@ -0,0 +1,205 @@
+\documentclass[10pt]{article}
+\usepackage[utf8x]{inputenc}
+\usepackage{amsmath, amssymb, amsthm, microtype}
+\usepackage[pagebackref=false,breaklinks=true,colorlinks=true,citecolor=blue]{hyperref}
+\usepackage[capitalize, noabbrev]{cleveref}
+\usepackage{graphicx}
+\usepackage{bbm}
+
+\DeclareMathOperator*{\argmax}{arg\,max}
+\DeclareMathOperator{\E}{\mathbb{E}}
+\let\P\relax
+\DeclareMathOperator{\P}{\mathbb{P}}
+\newcommand{\ex}[1]{\E\left[#1\right]}
+\newcommand{\prob}[1]{\P\left[#1\right]}
+\newcommand{\inprod}[1]{\left\langle #1 \right\rangle}
+\newcommand{\R}{\mathbf{R}}
+\newcommand{\N}{\mathbf{N}}
+\newcommand{\C}{\mathcal{C}}
+\newcommand{\eps}{\varepsilon}
+\newcommand{\bt}{\boldsymbol{\theta}}
+\newcommand{\bx}{\mathbf{x}}
+\newcommand{\cl}[1]{\text{\textbf{#1}}}
+\newcommand{\eqdef}{\mathbin{\stackrel{\rm def}{=}}}
+
+\newtheorem{theorem}{Theorem}
+\newtheorem{proposition}[theorem]{Proposition}
+\newtheorem{lemma}[theorem]{Lemma}
+\newtheorem{corollary}[theorem]{Corollary}
+\theoremstyle{definition}
+\newtheorem{definition}[theorem]{Definition}
+\theoremstyle{remark}
+\newtheorem*{example}{Example}
+\newtheorem*{remark}{Remark}
+
+\title{Regression Analysis with Network Data}
+\author{Thibaut Horel \and Jean Pouget-Abadie}
+
+\begin{document}
+
+\maketitle
+
+\begin{abstract}
+
+\end{abstract}
+
+\section{Introduction}
+
+Central question: \emph{relating properties of the graph to the difficulty of
+inferring the graph from cascades.}
+
+Two subquestions:
+\begin{itemize}
+ \item which properties of the graph make it impossible or hard to learn
+ (identifiability, convergence rate)?
+    \item which properties of the graph can be exploited to learn better or
+    faster (e.g. through a Bayesian prior)?
+\end{itemize}
+
+\section{Model}
+
+Weighted directed graph $G = (V, \Theta)$. $k=|V|$ is the number of nodes.
+$\Theta\in\R_{+}^{k\times k}$. $\Theta$ implicitly defines the edge set $E$ of
+the graph: $(i,j)\in E$ iff $\theta_{i,j} > 0$.
+
+Discrete-time model, $t\in\N$. Nodes can be in one of three states:
+\emph{susceptible}, \emph{infected} or \emph{immune}. $S_t$: nodes susceptible
+at the beginning of time step $t\in\N$. $I_t$: nodes infected at time step $t$.
+\begin{displaymath}
+ S_0 = V,\quad S_{t+1} = S_t \setminus I_t
+\end{displaymath}
+Nodes which are no longer susceptible, \emph{i.e.} nodes infected at an
+earlier time step, are immune.
+
+The dynamics of $I_t$ are described by a Markov process. Let us denote by
+$x^t$ the indicator vector of $I_t$ at time step $t\in\N$. $x^0$ is drawn from
+the \emph{source distribution} $p_s:\{0,1\}^k\to[0,1]$. For $t\geq 1$, the
+process evolves according to:
+\begin{equation}
+    \label{eq:markov}
+    \forall i\in S_t,\quad
+    \P\big(x_{i}^{t} = 1\,|\, x^{t-1}\big) = f(\bt_i\cdot x^{t-1})
+\end{equation}
+where $\bt_i$ is the $i$th column of $\Theta$, $f:\R\to[0,1]$, and the events
+$\{x^t_i = 1\}$ are independent across $i$ conditionally on $x^{t-1}$.
+A cascade continues until there are no more infected nodes.
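+
+\begin{example}[A concrete choice of $f$]
+    As an illustration (this particular $f$ is not assumed in what follows),
+    take $f(z) = 1 - e^{-z}$. Since $\bt_i\cdot x^{t-1} = \sum_{j\in I_{t-1}}
+    \theta_{j,i}$, \cref{eq:markov} becomes
+    \begin{displaymath}
+        \P\big(x^t_i = 1\,|\,x^{t-1}\big)
+        = 1 - \prod_{j\in I_{t-1}} e^{-\theta_{j,i}}
+        = 1 - \prod_{j\in I_{t-1}} \big(1 - p_{j,i}\big)
+        \quad\text{with } p_{j,i} \eqdef 1 - e^{-\theta_{j,i}},
+    \end{displaymath}
+    i.e. each node $j$ infected at time $t-1$ independently infects its
+    out-neighbor $i$ with probability $p_{j,i}$: this is a discrete-time
+    independent cascade model.
+\end{example}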
+
+
+\section{Identifiability}
+
+Together with \cref{eq:markov}, a given source distribution $p_s$ completely
+specifies the distribution $p$ of a cascade $\bx = (x^t)_{t\geq 0}$:
+\begin{displaymath}
+    p(\bx;\Theta) = p_s(x^0)\prod_{t\geq 1}\prod_{i\in S_t}
+    f(\bt_i\cdot x^{t-1})^{x^t_i}\big(1-f(\bt_i\cdot x^{t-1})\big)^{1-x_i^t}
+\end{displaymath}
+
+We view this model as parametrized by $\Theta$; $f$ and $p_s$ are considered
+fixed.
+
+\begin{definition}
+ We say that the model $\mathcal{P}
+ = \big\{p(\cdot\,;\,\Theta)\,|\,\Theta\in\R_+^{k\times k}\big\}$ is
+ identifiable iff:
+ \begin{displaymath}
+ \forall \Theta,\Theta'\in\R_+^{k\times k},\quad
+ p(\cdot\,;\,\Theta) = p(\cdot\, ;\, \Theta') \Rightarrow \Theta = \Theta'
+ \end{displaymath}
+\end{definition}
+
+In our case, identifiability is not guaranteed. In fact, two factors can lead
+to non-identifiability: lack of information and parent ambiguity. We
+illustrate both in the following two examples.
+
+\begin{figure}
+\begin{center}
+ \includegraphics[scale=0.8]{example.pdf}
+\end{center}
+\caption{Example of a graph for which the model can be non-identifiable,
+    either because of lack of information or because of parent ambiguity, when
+the source distribution is chosen adversarially.}
+\label{fig:ident}
+\end{figure}
+
+\vspace{.5em}
+
+\begin{example}[Lack of information]
+    Let us consider the graph in \cref{fig:ident} and the source distribution
+    $p_s$ defined by $p_s\big((0, 1, 0)\big) = 1$. In other words, node $2$ is
+    always the only infected node at $t=0$. Then it is clear that all cascades
+    die at $t=1$, at which point node $3$ becomes infected with probability
+    $f(\theta_{2,3})$. This probability does not depend on $\theta_{1,3}$.
+    Hence all matrices $\Theta$ compatible with the graph and sharing the same
+    value of $\theta_{2,3}$ yield the same cascade distribution.
+\end{example}
+
+\vspace{.5em}
+
+\begin{example}[Parent ambiguity]
+Let us consider again the graph in \cref{fig:ident}, in the case where the
+source distribution is such that $p_s\big((1,1,0)\big) = 1$, \emph{i.e.} nodes
+$1$ and $2$ are always jointly infected at time $t=0$. Then all cascades die
+at time $t=1$, at which point node $3$ becomes infected with probability
+$f(\theta_{1,3} + \theta_{2,3})$. This implies that all matrices $\Theta$
+compatible with the graph in \cref{fig:ident} (defining the same edge set) and
+satisfying $\theta_{1,3} + \theta_{2,3} = c$ for some fixed $c\in \R_+$ define
+the same cascade probability distribution.
+\end{example}
+
+\vspace{.5em}
+
+These examples show that for the model to be identifiable, we need conditions
+on the source distribution, in particular on how it interacts with the
+reachability structure of the graph.
+
+We will abuse the notation $p_s$ and, for $w\in V$ and $S\subseteq V$, write:
+\begin{displaymath}
+    p_s(w) \eqdef \sum_{\bx: \bx_w = 1} p_s(\bx), \qquad
+    p_s(S) \eqdef \sum_{\bx:\, \bx_w = 1 \,\forall w\in S} p_s(\bx)
+\end{displaymath}
+Furthermore we write $u\leadsto v$ if there is a directed path from $u$ to $v$
+in $G$ and $u\leadsto v\leadsto w$ if there is a directed path from $u$ to $w$
+passing through $v$ in $G$.
+
+\begin{proposition}
+Under the following assumptions, the model is identifiable:
+\begin{enumerate}
+    \item $\forall z\in \R_+,\; f(z)< 1$;
+    \item for all $S\subseteq V$ with $|S|\geq 2$, $p_s(S)<1$;
+    \item $\forall (u,v)\in V^2$, there exists $w$ such that $p_s(w)\neq 0$ and
+        $w\leadsto u\leadsto v$.
+\end{enumerate}
+\end{proposition}
+
+Assumption 2 prevents parent ambiguity at the source. Assumption 1 prevents
+parent ambiguity from occurring later on in the cascade even when there is no
+ambiguity at the source (it could be relaxed to an ``absence of forks''
+condition). Assumption 3 prevents lack of information.
+
+\begin{proof}
+
+\end{proof}
+
+\begin{remark} These conditions are the weakest under which identifiability
+    holds (we can show that they are necessary and sufficient). Reasonable
+    source distributions which imply the conditions include the uniform
+    distribution and independent Bernoulli sources.
+\end{remark}
+
+\section{Inference}
+
+\subsection{Maximum-Likelihood Estimation}
+
+Because transitions occur independently for each node $i$, the log-likelihood
+is a sum of $k$ terms, the $i$th term depending only on $\bt_i$. Hence, each
+$\bt_i$ can be learnt separately by solving a node-specific optimization
+problem.
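+For a single observed cascade $\bx$, this problem reads as follows (the
+expression follows directly from $p(\bx;\Theta)$ above; with several i.i.d.
+cascades, one sums the corresponding terms):
+\begin{displaymath}
+    \hat{\bt}_i \in \argmax_{\bt_i\in\R_+^{k}} \sum_{t\geq 1:\, i\in S_t}
+    \Big(x^t_i \log f(\bt_i\cdot x^{t-1})
+    + (1-x^t_i)\log\big(1-f(\bt_i\cdot x^{t-1})\big)\Big)
+\end{displaymath}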
+
+We assume that $\log f$ and $\log(1-f)$ are concave, so that this optimization
+problem is concave. Depending on the model, $\theta_{i,j}$ is either
+restricted to live in a compact subset of $\R_+$, or we need to assume that
+$\lim_{z\to\infty} f(z) = 1$. We can then apply standard results on the
+consistency and asymptotic normality of the MLE estimator.
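+
+For instance, the illustrative choice $f(z) = 1 - e^{-z}$ considered above
+satisfies these assumptions: $\log\big(1-f(z)\big) = -z$ is linear,
+$\frac{d}{dz}\log f(z) = 1/(e^z - 1)$ is decreasing so $\log f$ is concave,
+and $\lim_{z\to\infty} f(z) = 1$.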
+
+\subsection{Bayesian Approach}
+
+
+\end{document}