Diffstat (limited to 'presentation/econcs/beamer.tex')
| -rw-r--r-- | presentation/econcs/beamer.tex | 381 |
1 file changed, 381 insertions, 0 deletions
diff --git a/presentation/econcs/beamer.tex b/presentation/econcs/beamer.tex
new file mode 100644
index 0000000..7d4b5c1
--- /dev/null
+++ b/presentation/econcs/beamer.tex
@@ -0,0 +1,381 @@
+\documentclass[10pt]{beamer}
+
+\usepackage{amssymb, amsmath, graphicx, amsfonts, color}
+
+\title{Estimating a Graph's Parameters from Cascades}
+\author{Jean (John) Pouget-Abadie \\ Joint Work with Thibaut (T-bo) Horel}
+\date{}
+
+\begin{document}
+
+\begin{frame}
+\titlepage
+\end{frame}
+
+\begin{frame}
+\frametitle{Example}
+\begin{figure}
+\includegraphics[scale=.25]{../images/drawing.pdf}
+\caption{A network}
+\end{figure}
+\end{frame}
+
+\begin{frame}
+\frametitle{Example}
+\begin{figure}
+\includegraphics[scale=.25]{../images/noedges_step1.pdf}
+\caption{Cascade 1: $t=0$}
+\end{figure}
+\end{frame}
+
+\begin{frame}
+\frametitle{Example}
+\begin{figure}
+\includegraphics[scale=.25]{../images/noedges_step2.pdf}
+\caption{Cascade 1: $t=1$}
+\end{figure}
+\end{frame}
+
+\begin{frame}
+\frametitle{Example}
+\begin{figure}
+\includegraphics[scale=.25]{../images/noedges_step3.pdf}
+\caption{Cascade 1: $t=2$}
+\end{figure}
+\end{frame}
+
+\begin{frame}
+\frametitle{Example}
+\begin{figure}
+\includegraphics[scale=.25]{../images/noedges_step1_cascade2}
+\caption{Cascade 2: $t=0$}
+\end{figure}
+\end{frame}
+
+\begin{frame}
+\frametitle{Example}
+\begin{figure}
+\includegraphics[scale=.25]{../images/noedges_step2_cascade2}
+\caption{Cascade 2: $t=1$}
+\end{figure}
+\end{frame}
+
+\begin{frame}
+\frametitle{Example}
+\begin{figure}
+\includegraphics[scale=.25]{../images/noedges_step3_cascade2}
+\caption{Cascade 2: $t=2$}
+\end{figure}
+\end{frame}
+
+\begin{frame}
+\frametitle{Context}
+
+Notation:
+\begin{itemize}
+\item $({\cal G}, \theta)$: graph and its parameters
+\item Cascade: diffusion process of a `behavior' on $({\cal G}, \theta)$
+\item $(X_t)_c$: set of `active' nodes at time $t$ in cascade $c$
+\end{itemize}
+
+\begin{table}
+\begin{tabular}{c c}
+Graph $\implies$ Cascades & Cascades $\implies$ Graph \\ \hline
+$({\cal G}, \theta)$ is known & $(X_t)_c$ is observed \\
+Predict $(X_t) \mid X_0$ & Recover $({\cal G}, \theta)$ \\
+\end{tabular}
+\end{table}
+
+Summary:
+\begin{itemize}
+\item Many algorithms \emph{require} knowledge of $({\cal G}, \theta)$
+\item {\bf Graph Inference} is the problem of \emph{learning} $({\cal G}, \theta)$ from observed cascades
+\end{itemize}
+\end{frame}
+
+\begin{frame}
+\begin{block}{Decomposability}
+Learning the graph $\Leftrightarrow$ learning the parents of each node separately
+\end{block}
+\end{frame}
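+
+% Worked example of decomposability (an illustrative sketch; the cascade
+% values below are hypothetical): each step at which a node is susceptible
+% contributes one supervised sample, so the graph is learned one node --
+% i.e., one sparse regression -- at a time.
+\begin{frame}
+\frametitle{Decomposability: a Worked Example}
+\begin{itemize}
+\item Fix a node $i$. Each step $t$ at which $i$ is susceptible yields one measurement: features $x^t \in \{0,1\}^m$, label $x^{t+1}_i \in \{0,1\}$
+\item E.g.\ with $m = 4$ nodes and $x^1 = (1,1,0,0)$, node $3$ contributes the sample $\big((1,1,0,0),\, x^2_3\big)$
+\item Estimating $\theta_i$ from these samples, for every node $i$, recovers all of $({\cal G}, \theta)$
+\end{itemize}
+\end{frame}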
+
+\begin{frame}
+\frametitle{Problem Statement}
+\begin{itemize}
+\pause
+\item Can we learn ${\cal G}$ from $(X_t)_c$?
+\pause
+\item How many cascades? How many steps in each cascade?
+\pause
+\item Can we learn $\theta$ from $(X_t)_c$?
+\pause
+\item How does the error decrease with $n_{\text{cascades}}$?
+\pause
+\item Which graphs are easy to learn? Which are harder?
+\pause
+\item What kind of diffusion processes can we consider?
+\pause
+\item What is the minimal number of cascades required to learn $({\cal G}, \theta)$?
+\end{itemize}
+\end{frame}
+
+\begin{frame}
+\frametitle{Notation}
+\begin{itemize}
+\item $n$: number of measurements
+\item $N$: number of cascades
+\item $m$: number of nodes
+\item $s$: degree of the node considered
+\end{itemize}
+\end{frame}
+
+\begin{frame}
+\frametitle{Related Work}
+
+\begin{itemize}
+\pause
+\item Can we learn ${\cal G}$ from $(X_t)_c$?
+\pause
+\\ {\color{blue} Yes}
+\pause
+\item How many cascades? How many steps in each cascade?
+\pause
+\\ {\color{blue} $\mathrm{poly}(s) \log m$ cascades}
+\pause
+\item Can we learn $\theta$ from $(X_t)_c$?
+\pause
+\\ {\color{blue} (?)}
+\pause
+\item How does the error decrease with $n_{\text{cascades}}$?
+\pause
+\\ {\color{blue} (?)}
+\pause
+\item Which graphs are easy to learn? Which are harder?
+\pause
+\\ {\color{blue} Sparse graphs are easy}
+\pause
+\item What kind of diffusion processes can we consider?
+\pause
+\\ {\color{blue} IC model (discrete and continuous)}
+\pause
+\item What is the minimal number of cascades required to learn $({\cal G}, \theta)$?
+\pause
+\\ {\color{blue} (?)\dots $s \log (m/s)$ in a specific setting}
+\end{itemize}
+\end{frame}
+
+\begin{frame}
+\frametitle{Our Work}
+\begin{itemize}
+\pause
+\item Can we learn ${\cal G}$ from $(X_t)_c$?
+\pause
+\\ {\color{blue} Yes} $\rightarrow$ {\color{red} Yes}
+\pause
+\item How many cascades? How many steps in each cascade?
+\pause
+\\ {\color{blue} $\mathrm{poly}(s) \log m$ cascades} $\rightarrow$ {\color{red} $s \log m$ measurements}
+\pause
+\item Can we learn $\theta$ from $(X_t)_c$?
+\pause
+\\ {\color{blue} (?)} $\rightarrow$ {\color{red} Yes!}
+\pause
+\item How does the error decrease with $n_{\text{cascades}}$?
+\pause
+\\ {\color{blue} (?)} $\rightarrow$ {\color{red} ${\cal O}\left(\sqrt{\frac{s \log m}{n}}\right)$}
+\pause
+\item Which graphs are easy to learn? Which are harder?
+\pause
+\\ {\color{blue} Sparse graphs are easy} $\rightarrow$ {\color{red} Approx. sparsity is also easy}
+\pause
+\item What kind of diffusion processes can we consider?
+\pause
+\\ {\color{blue} IC model (discrete and continuous)} $\rightarrow$ {\color{red} Large class of cascade models}
+\pause
+\item What is the minimal number of cascades required to learn $({\cal G}, \theta)$?
+\pause
+\\ {\color{blue} $s \log (m/s)$ in a specific setting} $\rightarrow$ {\color{red} $s \log (m/s)$ for approx. sparse graphs}
+\end{itemize}
+\end{frame}
+
+\begin{frame}
+\frametitle{Voter Model}
+\begin{itemize}
+\pause
+\item {\color{red} Red} and {\color{blue} Blue} nodes. At every step, each node $i$ picks one of its neighbors $j$ with probability $p_{j,i}$ and adopts $j$'s color at time $t+1$
+\pause
+\item If {\color{blue} Blue} is the `contagious' state:
+\pause
+\begin{equation}
+\nonumber
+\mathbb{P}(i \in X^{t+1} \mid X^t) = \sum_{j \in {\cal N}(i) \cap X^t} p_{j,i} = X^t \cdot \theta_i
+\end{equation}
+\end{itemize}
+\end{frame}
+
+\begin{frame}
+\frametitle{Independent Cascade Model}
+\begin{itemize}
+\pause
+\item Each `infected' node $i$ infects each of its neighbors $j$ independently with probability $p_{i,j}$.
+\pause
+\item A node stays `infected' for a single step, then becomes `inactive'.
+$$\mathbb{P}(j \text{ becomes infected at } t+1 \mid X^t) = 1 - \prod_{i \in {\cal N}(j) \cap X^t} (1 - p_{i,j})$$
+\end{itemize}
+\end{frame}
+
+\begin{frame}
+\frametitle{Independent Cascade Model}
+\begin{align}
+\mathbb{P}(j \in X^{t+1} \mid X^t) & = 1 - \prod_{i \in {\cal N}(j) \cap X^t} (1 - p_{i,j}) \\
+& = 1 - \exp \left[ \sum_{i \in {\cal N}(j) \cap X^t} \log(1 - p_{i,j}) \right] \\
+& = 1 - \exp \left[ X^t \cdot \theta_j \right]
+\end{align}
+
+where $\theta_{i,j} := \log (1 - p_{i,j})$ and $\theta_j := (\theta_{i,j})_i$
+\\[5ex]
+\begin{itemize}
+\item Support of $\vec \theta$ $\Leftrightarrow$ support of $\vec p$
+\end{itemize}
+\end{frame}
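+
+% Worked numerical check of the reformulation above (an illustrative
+% sketch; the two edge probabilities are hypothetical values):
+% 1 - exp(X^t . theta_j) reproduces the product form exactly.
+\begin{frame}
+\frametitle{Independent Cascade Model: a Worked Example}
+\begin{itemize}
+\item Node $j$ has two infected parents, with $p_{1,j} = 0.5$ and $p_{2,j} = 0.2$:
+$$\mathbb{P}(j \in X^{t+1} \mid X^t) = 1 - (1 - 0.5)(1 - 0.2) = 0.6$$
+\item Equivalently, with $\theta_{1,j} = \log 0.5$ and $\theta_{2,j} = \log 0.8$:
+$$1 - \exp\left[\theta_{1,j} + \theta_{2,j}\right] = 1 - e^{\log 0.4} = 0.6$$
+\item The parameters enter linearly inside the exponential, which is what makes sparse-recovery techniques applicable
+\end{itemize}
+\end{frame}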
+
+\begin{frame}
+\frametitle{Model Comparison}
+\begin{table}
+\centering
+\begin{tabular}{c | c}
+Voter Model & Indep. Casc. Model \\[1ex]
+\hline \\[.1ex]
+Markov & Markov \\[3ex]
+Infections indep. given $X^t$ & Infections indep. given $X^t$ \\[3ex]
+$\mathbb{P}(j \in X^{t+1} \mid X^t) = X^t \cdot \theta_j$ & $\mathbb{P}(j \in X^{t+1} \mid X^t) = 1 - \exp(X^t \cdot \theta_j)$ \\[3ex]
+Always susceptible & Susceptible until infected \\
+\includegraphics[scale=.4]{../images/voter_model.pdf} & \includegraphics[scale=.3]{../images/icc_model.pdf} \\
+\end{tabular}
+\end{table}
+\end{frame}
+
+\begin{frame}
+\frametitle{Generalized Linear Cascade Models}
+\begin{definition}
+{\bf Generalized Linear Cascade Model} with inverse link function $f : \mathbb{R} \rightarrow [0,1]$:
+\begin{itemize}
+\item for each \emph{susceptible} node $j$ at time $t$, $\mathbf{1}\{j \in X^{t+1}\}$ is a Bernoulli variable with parameter $f(\theta_j \cdot X^t)$
+\end{itemize}
+\end{definition}
+\begin{itemize}
+\item Voter model: $f(z) = z$; \quad IC model: $f(z) = 1 - e^{z}$
+\end{itemize}
+\end{frame}
+
+\begin{frame}
+\frametitle{Sparse Recovery}
+\begin{figure}
+\includegraphics[scale=.6]{../images/sparse_recovery_illustration.pdf}
+\caption{$f(X \cdot \theta) = b$}
+\end{figure}
+\end{frame}
+
+\begin{frame}
+\frametitle{$\ell_1$-penalized Maximum Likelihood}
+\begin{itemize}
+\item Decomposable node by node
+\item Sum over ${\cal T}_i$, the steps at which node $i$ is susceptible
+\end{itemize}
+
+\begin{block}{Likelihood function}
+\begin{equation}
+\nonumber
+{\cal L}(\theta_i \mid x^1, \dots, x^n) = \frac{1}{|{\cal T}_i|} \sum_{t \in {\cal T}_i} x^{t+1}_i \log f(\theta_i \cdot x^t) + (1 - x^{t+1}_i) \log(1 - f(\theta_i \cdot x^t))
+\end{equation}
+\end{block}
+
+\begin{block}{Algorithm}
+\begin{equation}
+\nonumber
+\hat \theta_i \in \arg \max_{\theta_i} {\cal L}(\theta_i \mid x^1, \dots, x^n) - \lambda \|\theta_i\|_1
+\end{equation}
+\end{block}
+\end{frame}
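+
+% Specialization sketch (illustration only): plugging the IC inverse link
+% f(z) = 1 - e^z from the earlier slide into the likelihood above, using
+% log(1 - f(z)) = z.
+\begin{frame}
+\frametitle{Example: the Likelihood under the IC Model}
+With $f(z) = 1 - e^{z}$, so that $\log(1 - f(\theta_i \cdot x^t)) = \theta_i \cdot x^t$:
+\begin{equation}
+\nonumber
+{\cal L}(\theta_i \mid x^1, \dots, x^n) = \frac{1}{|{\cal T}_i|} \sum_{t \in {\cal T}_i} x^{t+1}_i \log\left(1 - e^{\theta_i \cdot x^t}\right) + (1 - x^{t+1}_i)\, \theta_i \cdot x^t
+\end{equation}
+\begin{itemize}
+\item Concave in $\theta_i$, so the $\ell_1$-penalized maximization is a convex program
+\end{itemize}
+\end{frame}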
+
+\begin{frame}
+\frametitle{Main Result}
+\begin{theorem}
+Assume a restricted eigenvalue condition on the Hessian and certain regularity properties of $f$. Then there exists $C > 0$, depending only on the properties of ${\cal G}$, such that with high probability:
+$$\| \theta^*_i - \hat \theta_i \|_2 \leq C \sqrt{\frac{s \log m}{n}}$$
+\end{theorem}
+
+\begin{corollary}
+By thresholding $\hat \theta_i$, if $n > C' s \log m$, we recover the support of $\theta^*_i$ and therefore the incoming edges of node $i$ in ${\cal G}$
+\end{corollary}
+\end{frame}
+
+\begin{frame}
+\frametitle{Approximate Sparsity}
+\begin{itemize}
+\item $\theta^*_{\lceil s \rceil}$: best $s$-sparse approximation of $\theta^*$
+\item $\|\theta^* - \theta^*_{\lceil s \rceil}\|_1$: `tail' of $\theta^*$
+\end{itemize}
+\begin{theorem}
+Assume a restricted eigenvalue condition on the Hessian and certain regularity properties of $f$. Then there exist $C_1, C_2 > 0$, depending only on the properties of ${\cal G}$, such that with high probability:
+\begin{equation}
+\nonumber
+\|\hat \theta_i - \theta^*_i\|_2 \leq C_1 \sqrt{\frac{s \log m}{n}} + C_2 \sqrt[4]{\frac{s \log m}{n}}\, \|\theta^* - \theta^*_{\lceil s \rceil}\|_1
+\end{equation}
+\end{theorem}
+\end{frame}
+
+\begin{frame}
+\frametitle{Lower Bound}
+\begin{itemize}
+\item Under a correlation-decay assumption for the IC model, $\Omega(s \log (m/s))$ cascades are necessary for graph reconstruction (Netrapalli and Sanghavi, SIGMETRICS'12)
+\item Adapting (Price \& Woodruff, STOC'12): in the approximately sparse case, any algorithm for any generalized linear cascade model satisfying
+$$\|\hat \theta - \theta^*\|_2 \leq C \|\theta^* - \theta^*_{\lceil s \rceil}\|_2$$
+requires $\Omega(s \log (m/s) / \log C)$ measurements.
+\end{itemize}
+\end{frame}
+
+\begin{frame}
+\frametitle{(RE) Assumptions}
+\begin{block}{Assumption on the Hessian}
+\begin{itemize}
+\item The Hessian must satisfy a `restricted eigenvalue' property, i.e.\ its smallest eigenvalue restricted to sparse vectors is bounded away from $0$.
+\end{itemize}
+\end{block}
+
+\begin{block}{From Hessian to Gram Matrix}
+\begin{itemize}
+\item $\mathbb{E}[X X^T]$: `expected' Gram matrix of the observations
+\item $\mathbb{E}[X X^T]_{i,i}$: probability that node $i$ is infected
+\item $\mathbb{E}[X X^T]_{i,j}$: probability that nodes $i$ and $j$ are infected simultaneously
+\end{itemize}
+\end{block}
+\end{frame}
+
+\begin{frame}
+\frametitle{Future Work}
+
+\begin{block}{Linear Threshold Model}
+\begin{itemize}
+\item The linear threshold model is a generalized linear cascade with a non-differentiable inverse link function (see the smoothing sketch on the next slide):
+$$\mathbb{P}(j \in X^{t+1} \mid X^t) = \mathbf{1}\{\theta_j \cdot X^t \geq \tau_j\}$$
+\end{itemize}
+\end{block}
+
+\begin{block}{Noisy Influence Maximization}
+\end{block}
+
+\begin{block}{Confidence Intervals}
+\end{block}
+
+\begin{block}{Active Learning}
+\end{block}
+\end{frame}
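+
+% A possible smoothing of the threshold link above (an illustrative
+% sketch, not a result from this work; the logistic slope beta is a
+% hypothetical smoothing parameter):
+\begin{frame}
+\frametitle{Linear Threshold Model: a Smoothing Sketch}
+\begin{itemize}
+\item Replace the hard threshold with a logistic link of slope $\beta > 0$:
+$$f_\beta(\theta_j \cdot X^t) = \frac{1}{1 + e^{-\beta (\theta_j \cdot X^t - \tau_j)}}$$
+\item $f_\beta$ is differentiable, and $f_\beta \rightarrow \mathbf{1}\{\theta_j \cdot X^t \geq \tau_j\}$ pointwise (away from the threshold) as $\beta \rightarrow \infty$
+\end{itemize}
+\end{frame}
+
+\end{document}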
