\documentclass[10pt]{beamer}
\usepackage{amssymb, amsmath, graphicx, amsfonts, color}
\title{Estimating a Graph's Parameters from Cascades}
\author{Jean (John) Pouget-Abadie \\ Joint Work with Thibaut (T-bo) Horel}
\date{}
\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}
\frametitle{Example}
\begin{figure}
\includegraphics[scale=.25]{../images/drawing.pdf}
\caption{A network}
\end{figure}
\end{frame}

\begin{frame}
\frametitle{Example}
\begin{figure}
\includegraphics[scale=.25]{../images/noedges_step1.pdf}
\caption{Cascade 1: $t=0$}
\end{figure}
\end{frame}

\begin{frame}
\frametitle{Example}
\begin{figure}
\includegraphics[scale=.25]{../images/noedges_step2.pdf}
\caption{Cascade 1: $t=1$}
\end{figure}
\end{frame}

\begin{frame}
\frametitle{Example}
\begin{figure}
\includegraphics[scale=.25]{../images/noedges_step3.pdf}
\caption{Cascade 1: $t=2$}
\end{figure}
\end{frame}

\begin{frame}
\frametitle{Example}
\begin{figure}
\includegraphics[scale=.25]{../images/noedges_step1_cascade2}
\caption{Cascade 2: $t=0$}
\end{figure}
\end{frame}

\begin{frame}
\frametitle{Example}
\begin{figure}
\includegraphics[scale=.25]{../images/noedges_step2_cascade2}
\caption{Cascade 2: $t=1$}
\end{figure}
\end{frame}

\begin{frame}
\frametitle{Example}
\begin{figure}
\includegraphics[scale=.25]{../images/noedges_step3_cascade2}
\caption{Cascade 2: $t=2$}
\end{figure}
\end{frame}

\begin{frame}
\frametitle{Context}
Notation:
\begin{itemize}
\item $({\cal G}, \theta)$: graph, parameters
\item Cascade: diffusion process of a `behavior' on $({\cal G}, \theta)$
\item $(X_t)_c$: set of `active' nodes at time $t$ in cascade $c$
\end{itemize}
\begin{table}
\begin{tabular}{c c}
Graph $\implies$ Cascades & Cascades $\implies$ Graph \\
\hline
$({\cal G}, \theta)$ is known & $(X_t)_c$ is observed \\
Predict $(X_t) \mid X_0$ & Recover $({\cal G}, \theta)$ \\
\end{tabular}
\end{table}
Summary:
\begin{itemize}
\item Many algorithms \emph{require} knowledge of $({\cal G}, \theta)$
\item {\bf Graph Inference} is the problem of \emph{learning} $({\cal G}, \theta)$
\end{itemize}
\end{frame}

\begin{frame}
\begin{block}{Decomposability}
Learning the graph $\Leftrightarrow$ learning the parents of each node separately
\end{block}
\end{frame}

\begin{frame}
\frametitle{Problem Statement}
\begin{itemize}
\pause \item Can we learn ${\cal G}$ from $(X_t)_c$?
\pause \item How many cascades? How many steps in each cascade?
\pause \item Can we learn $\theta$ from $(X_t)_c$?
\pause \item How does the error decrease with $n_{\text{cascades}}$?
\pause \item Are there graphs which are easy to learn? Harder to learn?
\pause \item What kind of diffusion processes can we consider?
\pause \item What is the minimal number of cascades required to learn $({\cal G}, \theta)$?
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Notation}
\begin{itemize}
\item $n$: number of measurements
\item $N$: number of cascades
\item $m$: number of nodes
\item $s$: degree of the node considered
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Related Work}
\begin{itemize}
\pause \item Can we learn ${\cal G}$ from $(X_t)_c$?
\pause \\ {\color{blue} Yes}
\pause \item How many cascades? How many steps in each cascade?
\pause \\ {\color{blue} $\mathrm{poly}(s) \log m$ cascades}
\pause \item Can we learn $\theta$ from $(X_t)_c$?
\pause \\ {\color{blue} (?)}
\pause \item How does the error decrease with $n_{\text{cascades}}$?
\pause \\ {\color{blue} (?)}
\pause \item Are there graphs which are easy to learn? Harder to learn?
\pause \\ {\color{blue} Sparse graphs are easy}
\pause \item What kind of diffusion processes can we consider?
\pause \\ {\color{blue} IC model (discrete and continuous)}
\pause \item What is the minimal number of cascades required to learn $({\cal G}, \theta)$?
\pause \\ {\color{blue} (?)\dots $s \log(m/s)$ in a specific setting}
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Our Work}
\begin{itemize}
\pause \item Can we learn ${\cal G}$ from $(X_t)_c$?
\pause \\ {\color{blue} Yes} $\rightarrow$ {\color{red} Yes}
\pause \item How many cascades? How many steps in each cascade?
\pause \\ {\color{blue} $\mathrm{poly}(s) \log m$ cascades} $\rightarrow$ {\color{red} $s \log m$ measurements}
\pause \item Can we learn $\theta$ from $(X_t)_c$?
\pause \\ {\color{blue} (?)} $\rightarrow$ {\color{red} Yes!}
\pause \item How does the error decrease with $n_{\text{cascades}}$?
\pause \\ {\color{blue} (?)} $\rightarrow$ {\color{red} ${\cal O}(\sqrt{s \log m / n})$}
\pause \item Are there graphs which are easy to learn? Harder to learn?
\pause \\ {\color{blue} Sparse graphs are easy} $\rightarrow$ {\color{red} Approx.\ sparsity is also easy}
\pause \item What kind of diffusion processes can we consider?
\pause \\ {\color{blue} IC model (discrete, continuous)} $\rightarrow$ {\color{red} Large class of cascade models}
\pause \item What is the minimal number of cascades required to learn $({\cal G}, \theta)$?
\pause \\ {\color{blue} $s \log(m/s)$ in a specific setting} $\rightarrow$ {\color{red} $s \log(m/s)$ for approx.\ sparse graphs}
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Voter Model}
\begin{itemize}
\pause \item {\color{red} Red} and {\color{blue} Blue} nodes. At every step, each node $i$ picks one of its neighbors $j$ with probability $p_{j,i}$ and adopts that neighbor's color at $t+1$
\pause \item If {\color{blue} Blue} is the `contagious' state:
\pause
\begin{equation} \nonumber
\mathbb{P}(i \in X_{t+1} \mid X_t) = \sum_{j \in {\cal N}(i) \cap X_t} p_{j,i} = X_t \cdot \theta_i,
\quad \text{where } \theta_i := (p_{j,i})_j
\end{equation}
\end{itemize}
(A simulation sketch of these dynamics follows on the next slide.)
\end{frame}
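\begin{frame}[fragile]
\frametitle{Voter Model: Simulation Sketch}
A minimal sketch of the voter-model dynamics on an assumed 3-node weight matrix (illustrative only, not the paper's code). Column $i$ of \texttt{P} holds the probabilities $p_{j,i}$ with which node $i$ copies neighbor $j$:
{\small
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
# P[j, i] = p_{j,i}; each column sums to 1 (assumed values)
P = np.array([[0.0, 0.5, 0.3],
              [0.6, 0.0, 0.7],
              [0.4, 0.5, 0.0]])

def voter_step(x, P, rng):
    """x[i] = 1 if node i is Blue; each node copies a
    random neighbor drawn from its column of P."""
    picks = [rng.choice(len(x), p=P[:, i]) for i in range(len(x))]
    return np.array([x[j] for j in picks])

x = np.array([1, 0, 0])  # X_0: node 0 is Blue
for t in range(3):
    x = voter_step(x, P, rng)
    print(t + 1, x)
\end{verbatim}
}
Averaging over the randomness of \texttt{picks} recovers $\mathbb{P}(i \in X_{t+1} \mid X_t) = X_t \cdot \theta_i$.
\end{frame}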
\begin{frame}
\frametitle{Independent Cascade Model}
\begin{itemize}
\pause \item Each `infected' node $i$ has a probability $p_{i,j}$ of infecting each of its neighbors $j$.
\pause \item A node stays `infected' for a single turn; then it becomes `inactive'.
$$\mathbb{P}(j \text{ becomes infected at } t+1 \mid X_t) = 1 - \prod_{i \in {\cal N}(j) \cap X_t} (1 - p_{i,j})$$
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Independent Cascade Model}
\begin{align*}
\mathbb{P}(j \in X_{t+1} \mid X_t) & = 1 - \prod_{i \in {\cal N}(j) \cap X_t} (1 - p_{i,j}) \\
& = 1 - \exp \left[ \sum_{i \in {\cal N}(j) \cap X_t} \log(1 - p_{i,j}) \right] \\
& = 1 - \exp \left[ X_t \cdot \theta_j \right]
\end{align*}
where $\theta_{i,j} := \log(1 - p_{i,j})$ and $\theta_j := (\theta_{i,j})_i$ (worked example on the next slide) \\[5ex]
\begin{itemize}
\item Support of $\vec{\theta}$ $\Leftrightarrow$ support of $\vec{p}$
\end{itemize}
\end{frame}
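\begin{frame}
\frametitle{Independent Cascade Model: Worked Example}
A sanity check on the reparametrization, with assumed (illustrative) probabilities: suppose $j$ has exactly two infected neighbors at time $t$, with $p_{1,j} = p_{2,j} = 1/2$. Then
\begin{align*}
\mathbb{P}(j \in X_{t+1} \mid X_t) & = 1 - (1 - \tfrac{1}{2})(1 - \tfrac{1}{2}) = \tfrac{3}{4} \\
& = 1 - \exp\left[ \log \tfrac{1}{2} + \log \tfrac{1}{2} \right] = 1 - \exp\left[ X_t \cdot \theta_j \right]
\end{align*}
with $\theta_{1,j} = \theta_{2,j} = \log \tfrac{1}{2}$: the product of survival probabilities becomes a sum in the exponent, which is what makes the model linear in $\theta_j$.
\end{frame}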
\begin{frame}
\frametitle{Model Comparison}
\begin{table}
\centering
\begin{tabular}{c | c}
Voter Model & Indep. Casc. Model \\[1ex]
\hline \\[.1ex]
Markov & Markov \\[3ex]
Indep.\ activations given $X_t$ & Indep.\ activations given $X_t$ \\[3ex]
$\mathbb{P}(j \in X_{t+1} \mid X_t) = X_t \cdot \theta_j$ & $\mathbb{P}(j \in X_{t+1} \mid X_t) = 1 - \exp(X_t \cdot \theta_j)$ \\[3ex]
Always susceptible & Susceptible until infected \\
\includegraphics[scale=.4]{../images/voter_model.pdf} & \includegraphics[scale=.3]{../images/icc_model.pdf} \\
\end{tabular}
\end{table}
\end{frame}

\begin{frame}
\frametitle{Generalized Linear Cascade Models}
\begin{definition}
{\bf Generalized Linear Cascade Model} with inverse link function $f : \mathbb{R} \rightarrow [0,1]$:
\begin{itemize}
\item for each node $j$ \emph{susceptible} at time $t$, the event $j \in X_{t+1}$, conditioned on $X_t$, is a Bernoulli of parameter $f(\theta_j \cdot X_t)$
\end{itemize}
\end{definition}
\end{frame}

\begin{frame}
\frametitle{Sparse Recovery}
\begin{figure}
\includegraphics[scale=.6]{../images/sparse_recovery_illustration.pdf}
\caption{$f(X \cdot \theta) = b$}
\end{figure}
\end{frame}

\begin{frame}
\frametitle{$\ell_1$-penalized Maximum Likelihood}
\begin{itemize}
\item Decomposable node by node
\item Sum over ${\cal T}_i$, the time steps at which node $i$ is susceptible
\end{itemize}
\begin{block}{Likelihood function}
\begin{equation} \nonumber
{\cal L}(\theta_i \mid x^1, \dots, x^n) = \frac{1}{|{\cal T}_i|} \sum_{t \in {\cal T}_i} \left[ x^{t+1}_i \log f(\theta_i \cdot x^t) + (1 - x^{t+1}_i) \log(1 - f(\theta_i \cdot x^t)) \right]
\end{equation}
\end{block}
\begin{block}{Algorithm}
\begin{equation} \nonumber
\hat{\theta}_i \in \arg \max_{\theta_i} {\cal L}(\theta_i \mid x^1, \dots, x^n) - \lambda \|\theta_i\|_1
\end{equation}
\end{block}
(A sketch of this estimator for a logistic link follows on the next slide.)
\end{frame}
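\begin{frame}[fragile]
\frametitle{$\ell_1$-penalized Maximum Likelihood: Sketch}
For the special case of a logistic inverse link $f(z) = 1/(1 + e^{-z})$, the per-node program above is $\ell_1$-penalized logistic regression. A minimal sketch using scikit-learn; the susceptibility rule below (node not currently active) is our own simplification, not the paper's code:
{\small
\begin{verbatim}
import numpy as np
from sklearn.linear_model import LogisticRegression

def estimate_parents(pairs, i, lam):
    """pairs: list of (x_t, x_{t+1}) 0/1 state vectors.
    Returns an l1-penalized estimate of theta_i."""
    # Keep only steps where node i was susceptible
    # (simplified rule), then regress x_{t+1}[i] on x_t.
    rows = [(x, y[i]) for x, y in pairs if x[i] == 0]
    A = np.array([r[0] for r in rows])
    b = np.array([r[1] for r in rows])
    clf = LogisticRegression(penalty="l1", C=1.0 / lam,
                             solver="liblinear")
    clf.fit(A, b)
    return clf.coef_.ravel()  # threshold to read off edges
\end{verbatim}
}
\end{frame}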
\begin{frame}
\frametitle{Main Result}
\begin{theorem}
Assume a restricted eigenvalue condition on the Hessian and certain regularity properties of $f$. Then there exists $C > 0$, depending only on the properties of ${\cal G}$, such that with high probability:
$$\| \theta^*_i - \hat{\theta}_i \|_2 \leq C \sqrt{\frac{s \log m}{n}}$$
\end{theorem}
\begin{corollary}
By thresholding $\hat{\theta}_i$: if $n > C' s \log m$, we recover the support of $\theta^*$, and therefore the edges of ${\cal G}$.
\end{corollary}
\end{frame}

\begin{frame}
\frametitle{Approximate Sparsity}
\begin{itemize}
\item $\theta^*_{\lceil s \rceil}$: best $s$-sparse approximation of $\theta^*$
\item $\|\theta^* - \theta^*_{\lceil s \rceil}\|_1$: `tail' of $\theta^*$
\end{itemize}
\begin{theorem}
Assume a restricted eigenvalue condition on the Hessian and certain regularity properties of $f$. Then there exist $C_1, C_2 > 0$, depending only on the properties of ${\cal G}$, such that with high probability:
\begin{equation} \nonumber
\|\hat{\theta}_i - \theta^*_i\|_2 \leq C_1 \sqrt{\frac{s \log m}{n}} + C_2 \sqrt[4]{\frac{s \log m}{n}} \, \|\theta^* - \theta^*_{\lceil s \rceil}\|_1
\end{equation}
\end{theorem}
\end{frame}

\begin{frame}
\frametitle{Lower Bound}
\begin{itemize}
\item Under a correlation decay assumption for the IC model, $\Omega(s \log(m/s))$ cascades are necessary for graph reconstruction (Netrapalli and Sanghavi, SIGMETRICS'12)
\item Adapting (Price \& Woodruff, STOC'12): in the approximately sparse case, any algorithm for any generalized linear cascade model achieving
$$\|\hat{\theta} - \theta^*\|_2 \leq C \|\theta^* - \theta^*_{\lceil s \rceil}\|_2$$
requires $\Omega(s \log(m/s) / \log C)$ measurements.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Restricted Eigenvalue (RE) Assumption}
\begin{block}{Assumption on the Hessian}
\begin{itemize}
\item The Hessian has to satisfy a `restricted eigenvalue' property, i.e., its smallest eigenvalue over sparse vectors is bounded away from $0$.
\end{itemize}
\end{block}
\begin{block}{From Hessian to Gram Matrix}
\begin{itemize}
\item $\mathbb{E}[X X^T]$: `expected' Gram matrix of the observations
\item $\mathbb{E}[X X^T]_{i,i}$: probability that node $i$ is infected
\item $\mathbb{E}[X X^T]_{i,j}$: probability that nodes $i$ and $j$ are infected simultaneously
\end{itemize}
\end{block}
\end{frame}

\begin{frame}
\frametitle{Future Work}
\begin{block}{Linear Threshold Model}
\begin{itemize}
\item The linear threshold model is a generalized linear cascade with a \emph{non-differentiable} inverse link function (a step function); see the sketch on the next slide:
$$\mathbb{P}(j \in X_{t+1} \mid X_t) = \mathbf{1}\{\theta_j \cdot X_t \geq t_j\}$$
\end{itemize}
\end{block}
\begin{block}{Noisy Influence Maximization}
\end{block}
\begin{block}{Confidence Intervals}
\end{block}
\begin{block}{Active Learning}
\end{block}
\end{frame}
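\begin{frame}[fragile]
\frametitle{Linear Threshold Model: Sketch}
A minimal sketch of one deterministic linear threshold step, with assumed weights and thresholds (illustrative only):
{\small
\begin{verbatim}
import numpy as np

# Theta[j, i] = weight of node i on node j (assumed values)
Theta = np.array([[0.0, 0.4, 0.5],
                  [0.3, 0.0, 0.2],
                  [0.6, 0.1, 0.0]])
thresholds = np.array([0.5, 0.25, 0.55])

def lt_step(x, Theta, thresholds):
    """Node j activates iff theta_j . x >= t_j;
    active nodes stay active."""
    return np.maximum(x, (Theta @ x >= thresholds).astype(int))

x0 = np.array([1, 0, 0])
print(lt_step(x0, Theta, thresholds))  # [1 1 1]
\end{verbatim}
}
\end{frame}

\end{document}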