\documentclass[10pt]{article}
\usepackage[utf8x]{inputenc}
\usepackage{amsmath, amssymb, amsthm, microtype}
\usepackage{algpseudocode, algorithm, algorithmicx}
\usepackage[pagebackref=false,breaklinks=true, colorlinks=true,citecolor=blue]{hyperref}
\usepackage[capitalize, noabbrev]{cleveref}
\usepackage{graphicx, subfig}
\usepackage{bbm}
\usepackage{fullpage}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator{\E}{\mathbb{E}}
\let\P\relax
\DeclareMathOperator{\P}{\mathbb{P}}
\newcommand{\ex}[1]{\E\left[#1\right]}
\newcommand{\prob}[1]{\P\left[#1\right]}
\newcommand{\inprod}[1]{\left\langle #1 \right\rangle}
\newcommand{\R}{\mathbf{R}}
\newcommand{\N}{\mathbf{N}}
\newcommand{\C}{\mathcal{C}}
\newcommand{\eps}{\varepsilon}
\newcommand{\bt}{\boldsymbol{\theta}}
\newcommand{\bx}{\mathbf{x}}
\newcommand{\cl}[1]{\text{\textbf{#1}}}
\newcommand{\eqdef}{\mathbin{\stackrel{\rm def}{=}}}
\newtheorem{theorem}{Theorem}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\theoremstyle{remark}
\newtheorem*{example}{Example}
\newtheorem*{remark}{Remark}

\title{Regression Analysis with Network Data}
\author{Thibaut Horel \and Jean Pouget-Abadie}

\begin{document}
\maketitle

\begin{abstract}
\end{abstract}

\section{Introduction}

Central question: \emph{relating properties of the graph to the difficulty of performing graph inference from cascades.} Two subquestions:
\begin{itemize}
\item which properties of the graph make it impossible or hard to learn (identifiability, convergence rate)?
\item which properties of the graph can be exploited to learn better or faster (e.g., via a Bayesian prior)?
\end{itemize}

\section{Model}

Weighted directed graph $G = (V, \Theta)$, where $k=|V|$ is the number of nodes and $\Theta\in\R_{+}^{k\times k}$. $\Theta$ implicitly defines the edge set $E$ of the graph.

Discrete-time model, $t\in\N$. Nodes can be in one of three states: \emph{susceptible}, \emph{infected} or \emph{immune}. $S_t$ denotes the set of nodes susceptible at the beginning of time step $t\in\N$, and $I_t$ the set of nodes infected at time step $t$:
\begin{displaymath}
S_0 = V,\quad S_{t+1} = S_t \setminus I_t
\end{displaymath}
Nodes which are no longer susceptible are immune. The dynamics of $I_t$ are described by a Markov process. Let us denote by $x^t$ the indicator vector of $I_t$ at time step $t\in\N$. $x^0$ is drawn from the \emph{source distribution} $p_s:\{0,1\}^k\to[0,1]$. For $t\geq 1$, the Markov transition is:
\begin{equation}
\label{eq:markov}
\forall i\in S_t,\quad \P\big(x_{i}^{t} = 1\,|\, x^{t-1}\big) = f(\bt_i\cdot x^{t-1})
\end{equation}
where $\bt_i$ is the $i$th column of $\Theta$ and $f:\R\to[0,1]$; conditionally on $x^{t-1}$, infections at time $t$ are independent across nodes $i$. A cascade continues until there are no more infected nodes.

\section{Identifiability}

A given source distribution $p_s$ together with \cref{eq:markov} completely specifies the distribution $p$ of a cascade $\mathbf{x} = (x^t)_{t\geq 0}$:
\begin{displaymath}
p(\bx;\Theta) = p_s(x^0)\prod_{t\geq 1}\prod_{i\in S_t} f(\bt_i\cdot x^{t-1})^{x^t_i}\big(1-f(\bt_i\cdot x^{t-1})\big)^{1-x_i^t}
\end{displaymath}
We view this model as parametrized by $\Theta$; $f$ and $p_s$ are considered fixed.
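Both the cascade process and the likelihood above are straightforward to implement, which we rely on later for inference. The following sketch is only an illustration, not part of the model specification: it assumes the particular choice $f(z) = 1 - e^{-z}$ (log-concave, with $f(0)=0$ and $\lim_{z\to\infty} f(z)=1$), represents a cascade as the list of indicator vectors $x^0, x^1, \dots$, and uses function names of our own choosing.
\begin{verbatim}
import numpy as np
from scipy.special import xlogy, xlog1py

def f(z):
    # Assumed transmission function: log-concave, f(0) = 0, f(z) -> 1 as z -> infinity.
    return -np.expm1(-z)

def simulate_cascade(Theta, x0, rng):
    # Draw one cascade (x^0, x^1, ...) from the Markov transition
    # P(x_i^t = 1 | x^{t-1}) = f(theta_i . x^{t-1}), where theta_i is column i of Theta.
    susceptible = ~x0.astype(bool)                # S_1 = V \ I_0
    x_prev, cascade = x0.astype(float), [x0.astype(float)]
    while x_prev.any():
        p = f(Theta.T @ x_prev)                   # p_i = f(theta_i . x^{t-1})
        x_next = (rng.random(len(x_prev)) < p) & susceptible
        susceptible &= ~x_next                    # S_{t+1} = S_t \ I_t
        x_prev = x_next.astype(float)
        cascade.append(x_prev)
    return cascade

def log_likelihood(Theta, cascade):
    # log p(x; Theta), up to the Theta-independent source term log p_s(x^0).
    susceptible = ~cascade[0].astype(bool)
    ll = 0.0
    for x_prev, x_next in zip(cascade[:-1], cascade[1:]):
        p = f(Theta.T @ x_prev)
        s = susceptible                           # only nodes in S_t contribute at step t
        ll += np.sum(xlogy(x_next[s], p[s]) + xlog1py(1.0 - x_next[s], -p[s]))
        susceptible &= ~x_next.astype(bool)
    return ll
\end{verbatim}
Note that, for a fixed history, the factor associated with node $i$ only involves the column $\bt_i$; this separability is exploited in the inference section below.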
\begin{definition}
We say that the model $\mathcal{P} = \big\{p(\cdot\,;\,\Theta)\,|\,\Theta\in\R_+^{k\times k}\big\}$ is identifiable iff:
\begin{displaymath}
\forall \Theta,\Theta'\in\R_+^{k\times k},\quad p(\cdot\,;\,\Theta) = p(\cdot\, ;\, \Theta') \Rightarrow \Theta = \Theta'
\end{displaymath}
\end{definition}

In our case, identifiability is not guaranteed. In fact, there are two factors which can lead to non-identifiability: lack of information and parent ambiguity. We illustrate both factors in the two examples below.

\begin{figure}
\begin{center}
\includegraphics[scale=0.8]{example.pdf}
\end{center}
\caption{Example of a graph for which the model can fail to be identifiable, either because of lack of information or because of parent ambiguity, when the source distribution is chosen adversarially.}
\label{fig:ident}
\end{figure}

\vspace{.5em}
\begin{example}[Lack of information]
Let us consider the graph in \cref{fig:ident} and the source distribution $p_s$ defined by $p_s\big((0, 1, 0)\big) = 1$. In other words, node $2$ is always the only infected node at $t=0$. Then all cascades end after $t=1$, at which point node 3 becomes infected with probability $f(\theta_{2,3})$. This probability does not depend on $\theta_{1,3}$. Hence all matrices $\Theta$ compatible with the graph and sharing the same value of $\theta_{2,3}$ yield the same cascade distribution.
\end{example}

\vspace{.5em}
\begin{example}[Parent ambiguity]
Let us consider again the graph in \cref{fig:ident}, this time with a source distribution such that $p_s\big((1,1,0)\big) = 1$, \emph{i.e.} nodes 1 and 2 are always jointly infected at time $t=0$. Then all cascades end after $t=1$, at which point node 3 becomes infected with probability $f(\theta_{1,3} + \theta_{2,3})$. This implies that all matrices $\Theta$ which are compatible with the graph in \cref{fig:ident} (defining the same edge set) and for which $\theta_{1,3} + \theta_{2,3}$ equals a fixed constant $c\in \R_+$ define the same cascade distribution.
\end{example}

\vspace{.5em}
These examples show that for the model to be identifiable, we need conditions on the source distribution, in particular on how it interacts with the reachability structure of the graph. We will abuse the notation $p_s$ and, for $w\in V$, write:
\begin{displaymath}
p_s(w) \eqdef \sum_{\bx: \bx_w = 1} p_s(\bx)
\end{displaymath}
Furthermore, we write $u\leadsto v$ if there is a directed path from $u$ to $v$ in $G$, and $u\leadsto v\leadsto w$ if there is a directed path from $u$ to $w$ passing through $v$ in $G$.

\begin{proposition}
Assume that:
\begin{enumerate}
\item $\forall z\in \R_+,\; f(z)< 1$;
\item for all $S\subseteq V$ with $|S|\geq 2$, $p_s(S)<1$;
\item $\forall (u,v)\in V^2$, there exists $w$ such that $p_s(w)\neq 0$ and $w\leadsto u\leadsto v$.
\end{enumerate}
Then the model is identifiable.
\end{proposition}

Assumption 1 prevents parent ambiguity from occurring later in the cascade even when there is no ambiguity at the source (it could be relaxed to an ``absence of forks'' condition). Assumption 2 prevents parent ambiguity at the source. Assumption 3 prevents lack of information.

\begin{proof}
\end{proof}

\begin{remark}
These conditions are the weakest under which identifiability holds (we can show that they are necessary and sufficient). Reasonable source distributions which imply them include a uniformly random source and independent Bernoulli sources.
\end{remark}
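The assumptions of the proposition can be checked mechanically from the edge set defined by $\Theta$ and from the support of the source distribution. The sketch below is only an illustration of such a check, covering Assumptions 2 and 3 (Assumption 1 concerns $f$ alone); the graph representation, the function names, and the convention that every node is trivially reachable from itself are our own choices.
\begin{verbatim}
import numpy as np

def reachable(adj, start):
    # Nodes reachable from `start` by a directed path (we include `start` itself).
    seen, stack = {start}, [start]
    while stack:
        u = stack.pop()
        for v in np.flatnonzero(adj[u]):
            v = int(v)
            if v not in seen:
                seen.add(v)
                stack.append(v)
    return seen

def check_assumptions(Theta, source_support):
    # Theta: weight matrix; source_support: list of the sets S with p_s(S) > 0,
    # each given as a collection of node indices.
    adj = Theta > 0
    k = Theta.shape[0]
    # Assumption 2: p_s(S) < 1 for every S with |S| >= 2, i.e. p_s is not a
    # point mass on a multi-node source set.
    a2 = not (len(source_support) == 1 and len(source_support[0]) >= 2)
    # p_s(w) != 0 iff w belongs to at least one source set of positive probability.
    sources = set().union(*map(set, source_support))
    reach = {u: reachable(adj, u) for u in range(k)}
    # Assumption 3: for every pair (u, v) there is a source w with w ~> u ~> v.
    a3 = all(any(u in reach[w] and v in reach[u] for w in sources)
             for u in range(k) for v in range(k))
    return a2, a3
\end{verbatim}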
\section{Inference}

\subsection{Maximum-Likelihood Estimation}

Because transitions occur independently for each node $i$, the log-likelihood is a sum of $k$ terms, the $i$th term depending only on $\bt_i$. Hence, each $\bt_i$ can be learned separately by solving a node-specific optimization problem. We assume $f$ to be log-concave so that this optimization problem is concave. Depending on the model, $\theta_{i,j}$ is either restricted to a compact subset of $\R_+$, or we need to assume that $\lim_{z\to\infty} f(z) = 1$. We can then apply standard results on the consistency and asymptotic normality of maximum-likelihood estimators.

\subsection{Bayesian Approach}

\paragraph{Notation} Let us adopt a Bayesian approach to the previous problem. Recalling the notation introduced above, we let $G=(V, \Theta)$ be a weighted directed graph with $\Theta \in \R_+^{k \times k}$, and we place a prior $\mathcal{D}$ on $\Theta$. Let $x_0$ be a source node, sampled from a distribution $p_s$ over the vertices $V$ of $G$. Since we assume for now that the state of the source node is entirely observed, the choice of $p_s$ is not relevant to the inference of $\Theta$. Finally, we let $x_t$ (resp.~$\mathbbm{1}_{S_t}$) be the binary vector encoding the infected (resp.~susceptible) nodes at time $t$:
\begin{alignat*}{2}
\Theta & \sim \mathcal{D}\qquad &&\text{(prior on $\Theta$)} \\
x_0 & \sim p_s &&\text{(random source distribution)}\\
\forall t\geq 0,\; x_{t+1} \,|\, x_t, \Theta &\sim \mathrm{Ber}\big(f(\Theta^{\top} x_t)\cdot \mathbbm{1}_{S_{t+1}}\big) &&\text{(cascade process, $f$ applied entrywise)}
\end{alignat*}

\paragraph{Graph prior} Prior work has focused on maximum-likelihood estimation regularized with the lasso, summarized in the subsection above. This is equivalent, up to the choice of the regularization strength, to choosing independent Laplace priors on the entries of $\Theta$:
$$(\Theta_{i,j})_{i,j}\sim \prod_{i,j} \mathrm{Laplace}(0,1)$$
In the context of learning social networks, however, we can place more informative priors. We can:
\begin{itemize}
\item take into account prior knowledge of the existence (or absence) of edges;
\item take into account common graph structures, such as triangles or clustering.
\end{itemize}
A common prior over graphs is the exponential random graph model (ERGM)~\cite{}, defined by a feature vector $s(G)$ and a parameter vector $\beta$ through the probability distribution:
$$P(G \mid \beta) \propto \exp \left( s(G)\cdot \beta \right)$$

\paragraph{Inference} We can sample from the posterior by MCMC (a minimal sketch is given below). This might not be the fastest solution, however; we could greatly benefit from alternative methods:
\begin{itemize}
\item EM\@. This approach was used in \cite{linderman2014discovering} to learn the parameters of a Hawkes process, a closely related inference problem.
\item Variational inference. This approach was used in~\cite{linderman2015scalable} as an extension of the work cited in the previous bullet point.
\end{itemize}

\begin{figure}
\subfloat[][50 cascades]{
\includegraphics[scale=.4]{../simulation/plots/2015-11-05_22:52:30.pdf}}
\subfloat[][100 cascades]{
\includegraphics[scale=.4]{../simulation/plots/2015-11-05_22:52:47.pdf}}\\
\subfloat[][150 cascades]{
\includegraphics[scale=.4]{../simulation/plots/2015-11-05_22:53:24.pdf}}
\subfloat[][200 cascades]{
\includegraphics[scale=.4]{../simulation/plots/2015-11-05_22:55:39.pdf}}\\
\subfloat[][250 cascades]{
\includegraphics[scale=.4]{../simulation/plots/2015-11-05_22:57:26.pdf}}
\subfloat[][1000 cascades]{
\includegraphics[scale=.4]{../simulation/plots/2015-11-05_22:58:29.pdf}}
\caption{Bayesian inference of $\Theta$ with MCMC using a $\mathrm{Beta}(1, 1)$ prior. In each panel, the plot in the $i$th row and $j$th column represents a histogram of samples drawn from the posterior of the corresponding edge weight $\Theta_{i, j}$. The red line indicates the true value of the edge weight. If an edge does not exist (has weight $0$), the red line coincides with the $y$ axis.}
\label{betapriorbayeslearning}
\end{figure}
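The sketch below runs random-walk Metropolis over a single column $\bt_i$ of $\Theta$; sampling column by column is enough since the likelihood factorizes over columns. It is only an illustration and not the PyMC setup used in the experiments below: it assumes $f(z) = 1 - e^{-z}$ and an independent exponential prior on the entries of $\bt_i$ (a one-sided analogue of the Laplace prior above), represents cascades as lists of indicator vectors as in the earlier simulation sketch, and uses names of our own choosing.
\begin{verbatim}
import numpy as np

def node_log_posterior(theta_i, i, cascades, lam=1.0):
    # Unnormalized log-posterior of column theta_i under an independent
    # Exponential(lam) prior, using only the node-i factors of the cascade
    # likelihood with f(z) = 1 - exp(-z).
    lp = -lam * np.sum(theta_i)
    for cascade in cascades:
        susceptible = ~cascade[0].astype(bool)
        for x_prev, x_next in zip(cascade[:-1], cascade[1:]):
            if susceptible[i]:
                z = theta_i @ x_prev                  # theta_i . x^{t-1}
                # log f(z) = log(1 - exp(-z)),  log(1 - f(z)) = -z
                lp += np.log(-np.expm1(-z)) if x_next[i] else -z
            susceptible &= ~x_next.astype(bool)
    return lp

def metropolis_column(i, cascades, n_samples=5000, step=0.05, seed=0):
    # Random-walk Metropolis over theta_i, reflected at 0 to stay in R_+^k
    # (the reflected Gaussian proposal is symmetric, so the usual acceptance
    # ratio of posteriors applies).
    rng = np.random.default_rng(seed)
    k = len(cascades[0][0])
    theta = np.full(k, 0.1)
    lp = node_log_posterior(theta, i, cascades)
    samples = []
    for _ in range(n_samples):
        proposal = np.abs(theta + step * rng.standard_normal(k))
        lp_new = node_log_posterior(proposal, i, cascades)
        if np.log(rng.random()) < lp_new - lp:
            theta, lp = proposal, lp_new
        samples.append(theta.copy())
    return np.array(samples)                          # posterior samples of theta_i
\end{verbatim}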
\paragraph{Experiments} We ran experiments with the MCMC package PyMC on a simple network with 4 nodes and $\binom{4}{2}=6$ parameters to learn. We plot in \cref{betapriorbayeslearning} the progressive learning of $\Theta$ for increasing numbers of observations. Of note, since the IC model does not include self-loops, the diagonal terms are never properly learned, which is expected and not a problem. We notice that the existence or absence of an edge is learned relatively quickly, with the posterior on zero-weight edges concentrating around $0$ after $100$ cascades. Obtaining a posterior concentrated around the true non-zero edge weights requires $1000$ cascades, which is unreasonably high considering the small number of parameters learned in this experiment.

\subsection{Active Learning}

In this setup, the source is no longer drawn from a random distribution $p_s$ but is chosen by the designer of the experiment. Once a source is chosen, a cascade is drawn from the Markov process; we denote the resulting cascade distribution by $\mathcal{D}'$. We wish to choose the source which maximizes the information gain on the parameter $\Theta$, so as to speed up learning. We suggest choosing, at each round, the source which maximizes the mutual information between $\Theta$ and the observed cascade $(x_t)_{t\geq 1}$. The mutual information can be estimated for each candidate source by sampling:
\begin{equation*}
I\big((x_t)_{t\geq 1}, \Theta \,|\, x_0 = i\big) = \mathbb{E}_{\substack{\Theta \sim \mathcal{D}_n \\ x \,|\, \Theta, i \sim \mathcal{D}'}}\log p(x\,|\,\Theta) - \mathbb{E}_{\substack{\Theta \sim \mathcal{D}_n \\ x' \,|\, \Theta, i \sim \mathcal{D}'}} \log p(x')
\end{equation*}
where $\mathcal{D}_n$ is the current posterior on $\Theta$. The second term involves marginalizing over $\Theta$: $p(x') = \mathbb{E}_{\Theta \sim \mathcal{D}_n} p(x'\,|\, \Theta)$.

\begin{algorithm}
\caption{Active Bayesian Learning through Cascades (ABC)}
\begin{algorithmic}[1]
\State $\mathcal{D}_0 \gets \mathcal{D}$ \Comment{Initial prior on $\Theta$}
\For{$n = 0, 1, 2, \dots$}
\State $i \gets \argmax_{i} I\big((x_t)_{t \geq 1}, \Theta \,|\, x_0 = i\big)$ \Comment{Maximize mutual information under $\mathcal{D}_n$}
\State Observe $(x_t)_{t\geq1} \,|\, x_0 = i$ \Comment{Observe a cascade from source $i$}
\State $\mathcal{D}_{n+1} \gets \text{posterior of $\Theta \sim \mathcal{D}_n$ given $(x_t)_{t\geq1}$}$ \Comment{Update posterior on $\Theta$}
\EndFor
\end{algorithmic}
\end{algorithm}

\bibliographystyle{plain}
\bibliography{sparse}

\end{document}