Diffstat (limited to 'presentation/econcs/beamer.tex')
| -rw-r--r-- | presentation/econcs/beamer.tex | 381 |
1 file changed, 381 insertions, 0 deletions
diff --git a/presentation/econcs/beamer.tex b/presentation/econcs/beamer.tex
new file mode 100644
index 0000000..7d4b5c1
--- /dev/null
+++ b/presentation/econcs/beamer.tex
@@ -0,0 +1,381 @@
+\documentclass[10pt]{beamer}
+
+\usepackage{amssymb, amsmath, graphicx, amsfonts, color}
+
+\title{Estimating a Graph's Parameters from Cascades}
+\author{Jean (John) Pouget-Abadie \\ Joint Work with Thibaut (T-bo) Horel}
+\date{}
+
+\begin{document}
+
+\begin{frame}
+\titlepage
+\end{frame}
+
+\begin{frame}
+\frametitle{Example}
+\begin{figure}
+\includegraphics[scale=.25]{../images/drawing.pdf}
+\caption{A network}
+\end{figure}
+\end{frame}
+
+\begin{frame}
+\frametitle{Example}
+\begin{figure}
+\includegraphics[scale=.25]{../images/noedges_step1.pdf}
+\caption{Cascade 1: $t=0$}
+\end{figure}
+\end{frame}
+
+\begin{frame}
+\frametitle{Example}
+\begin{figure}
+\includegraphics[scale=.25]{../images/noedges_step2.pdf}
+\caption{Cascade 1: $t=1$}
+\end{figure}
+\end{frame}
+
+\begin{frame}
+\frametitle{Example}
+\begin{figure}
+\includegraphics[scale=.25]{../images/noedges_step3.pdf}
+\caption{Cascade 1: $t=2$}
+\end{figure}
+\end{frame}
+
+\begin{frame}
+\frametitle{Example}
+\begin{figure}
+\includegraphics[scale=.25]{../images/noedges_step1_cascade2}
+\caption{Cascade 2: $t=0$}
+\end{figure}
+\end{frame}
+
+\begin{frame}
+\frametitle{Example}
+\begin{figure}
+\includegraphics[scale=.25]{../images/noedges_step2_cascade2}
+\caption{Cascade 2: $t=1$}
+\end{figure}
+\end{frame}
+
+\begin{frame}
+\frametitle{Example}
+\begin{figure}
+\includegraphics[scale=.25]{../images/noedges_step3_cascade2}
+\caption{Cascade 2: $t=2$}
+\end{figure}
+\end{frame}
+
+\begin{frame}
+\frametitle{Context}
+
+Notation:
+\begin{itemize}
+\item $({\cal G}, \theta)$: graph and its parameters
+\item Cascade: diffusion process of a `behavior' on $({\cal G}, \theta)$
+\item $(X_t)_c$: set of `active' nodes at time $t$ in cascade $c$
+\end{itemize}
+
+\begin{table}
+\begin{tabular}{c c}
+Graph $\implies$ Cascades & Cascades $\implies$ Graph \\ \hline
+$({\cal G}, \theta)$ is known & $(X_t)_c$ is observed \\
+Predict $(X_t) \mid X_0$ & Recover $({\cal G}, \theta)$ \\
+\end{tabular}
+\end{table}
+
+Summary:
+\begin{itemize}
+\item Many algorithms \emph{require} knowledge of $({\cal G}, \theta)$
+\item {\bf Graph Inference} is the problem of \emph{learning} $({\cal G}, \theta)$ from observed cascades
+\end{itemize}
+\end{frame}
+
+\begin{frame}
+\begin{block}{Decomposability}
+Learning the graph $\Leftrightarrow$ learning the parents of each node separately
+\end{block}
+\end{frame}
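+
+% Worked example of decomposability (an illustrative sketch; the cascade
+% values below are hypothetical): each step at which a node is susceptible
+% contributes one supervised sample, so the graph is learned one node --
+% i.e., one sparse regression -- at a time.
+\begin{frame}
+\frametitle{Decomposability: a Worked Example}
+\begin{itemize}
+\item Fix a node $i$. Each step $t$ at which $i$ is susceptible yields one measurement: features $x^t \in \{0,1\}^m$, label $x^{t+1}_i \in \{0,1\}$
+\item E.g.\ with $m = 4$ nodes and $x^1 = (1,1,0,0)$, node $3$ contributes the sample $\big((1,1,0,0),\, x^2_3\big)$
+\item Estimating $\theta_i$ from these samples, for every node $i$, recovers all of $({\cal G}, \theta)$
+\end{itemize}
+\end{frame}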
+
+\begin{frame}
+\frametitle{Problem Statement}
+\begin{itemize}
+\pause
+\item Can we learn ${\cal G}$ from $(X_t)_c$?
+\pause
+\item How many cascades? How many steps in each cascade?
+\pause
+\item Can we learn $\theta$ from $(X_t)_c$?
+\pause
+\item How does the error decrease with $n_{\text{cascades}}$?
+\pause
+\item Which graphs are easy to learn? Which are harder?
+\pause
+\item What kind of diffusion processes can we consider?
+\pause
+\item What is the minimal number of cascades required to learn $({\cal G}, \theta)$?
+\end{itemize}
+\end{frame}
+
+\begin{frame}
+\frametitle{Notation}
+\begin{itemize}
+\item $n$: number of measurements
+\item $N$: number of cascades
+\item $m$: number of nodes
+\item $s$: degree of the node considered
+\end{itemize}
+\end{frame}
+
+\begin{frame}
+\frametitle{Related Work}
+
+\begin{itemize}
+\pause
+\item Can we learn ${\cal G}$ from $(X_t)_c$?
+\pause
+\\ {\color{blue} Yes}
+\pause
+\item How many cascades? How many steps in each cascade?
+\pause
+\\ {\color{blue} $\mathrm{poly}(s) \log m$ cascades}
+\pause
+\item Can we learn $\theta$ from $(X_t)_c$?
+\pause
+\\ {\color{blue} (?)}
+\pause
+\item How does the error decrease with $n_{\text{cascades}}$?
+\pause
+\\ {\color{blue} (?)}
+\pause
+\item Which graphs are easy to learn? Which are harder?
+\pause
+\\ {\color{blue} Sparse graphs are easy}
+\pause
+\item What kind of diffusion processes can we consider?
+\pause
+\\ {\color{blue} IC model (discrete and continuous)}
+\pause
+\item What is the minimal number of cascades required to learn $({\cal G}, \theta)$?
+\pause
+\\ {\color{blue} (?)\dots $s \log (m/s)$ in a specific setting}
+\end{itemize}
+\end{frame}
+
+\begin{frame}
+\frametitle{Our Work}
+\begin{itemize}
+\pause
+\item Can we learn ${\cal G}$ from $(X_t)_c$?
+\pause
+\\ {\color{blue} Yes} $\rightarrow$ {\color{red} Yes}
+\pause
+\item How many cascades? How many steps in each cascade?
+\pause
+\\ {\color{blue} $\mathrm{poly}(s) \log m$ cascades} $\rightarrow$ {\color{red} $s \log m$ measurements}
+\pause
+\item Can we learn $\theta$ from $(X_t)_c$?
+\pause
+\\ {\color{blue} (?)} $\rightarrow$ {\color{red} Yes!}
+\pause
+\item How does the error decrease with $n_{\text{cascades}}$?
+\pause
+\\ {\color{blue} (?)} $\rightarrow$ {\color{red} ${\cal O}\left(\sqrt{\frac{s \log m}{n}}\right)$}
+\pause
+\item Which graphs are easy to learn? Which are harder?
+\pause
+\\ {\color{blue} Sparse graphs are easy} $\rightarrow$ {\color{red} Approx. sparsity is also easy}
+\pause
+\item What kind of diffusion processes can we consider?
+\pause
+\\ {\color{blue} IC model (discrete and continuous)} $\rightarrow$ {\color{red} Large class of cascade models}
+\pause
+\item What is the minimal number of cascades required to learn $({\cal G}, \theta)$?
+\pause
+\\ {\color{blue} $s \log (m/s)$ in a specific setting} $\rightarrow$ {\color{red} $s \log (m/s)$ for approx. sparse graphs}
+\end{itemize}
+\end{frame}
+
+\begin{frame}
+\frametitle{Voter Model}
+\begin{itemize}
+\pause
+\item {\color{red} Red} and {\color{blue} Blue} nodes. At every step, each node $i$ picks one of its neighbors $j$ with probability $p_{j,i}$ and adopts $j$'s color at time $t+1$
+\pause
+\item If {\color{blue} Blue} is the `contagious' state:
+\pause
+\begin{equation}
+\nonumber
+\mathbb{P}(i \in X^{t+1} \mid X^t) = \sum_{j \in {\cal N}(i) \cap X^t} p_{j,i} = X^t \cdot \theta_i
+\end{equation}
+\end{itemize}
+\end{frame}
+
+\begin{frame}
+\frametitle{Independent Cascade Model}
+\begin{itemize}
+\pause
+\item Each `infected' node $i$ infects each of its neighbors $j$ independently with probability $p_{i,j}$.
+\pause
+\item A node stays `infected' for a single step, then becomes `inactive'.
+$$\mathbb{P}(j \text{ becomes infected at } t+1 \mid X^t) = 1 - \prod_{i \in {\cal N}(j) \cap X^t} (1 - p_{i,j})$$
+\end{itemize}
+\end{frame}
+
+\begin{frame}
+\frametitle{Independent Cascade Model}
+\begin{align}
+\mathbb{P}(j \in X^{t+1} \mid X^t) & = 1 - \prod_{i \in {\cal N}(j) \cap X^t} (1 - p_{i,j}) \\
+& = 1 - \exp \left[ \sum_{i \in {\cal N}(j) \cap X^t} \log(1 - p_{i,j}) \right] \\
+& = 1 - \exp \left[ X^t \cdot \theta_j \right]
+\end{align}
+
+where $\theta_{i,j} := \log (1 - p_{i,j})$ and $\theta_j := (\theta_{i,j})_i$
+\\[5ex]
+\begin{itemize}
+\item Support of $\vec \theta$ $\Leftrightarrow$ support of $\vec p$
+\end{itemize}
+\end{frame}
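+
+% Worked numerical check of the reformulation above (an illustrative
+% sketch; the two edge probabilities are hypothetical values):
+% 1 - exp(X^t . theta_j) reproduces the product form exactly.
+\begin{frame}
+\frametitle{Independent Cascade Model: a Worked Example}
+\begin{itemize}
+\item Node $j$ has two infected parents, with $p_{1,j} = 0.5$ and $p_{2,j} = 0.2$:
+$$\mathbb{P}(j \in X^{t+1} \mid X^t) = 1 - (1 - 0.5)(1 - 0.2) = 0.6$$
+\item Equivalently, with $\theta_{1,j} = \log 0.5$ and $\theta_{2,j} = \log 0.8$:
+$$1 - \exp\left[\theta_{1,j} + \theta_{2,j}\right] = 1 - e^{\log 0.4} = 0.6$$
+\item The parameters enter linearly inside the exponential, which is what makes sparse-recovery techniques applicable
+\end{itemize}
+\end{frame}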
+
+\begin{frame}
+\frametitle{Model Comparison}
+\begin{table}
+\centering
+\begin{tabular}{c | c}
+Voter Model & Indep. Casc. Model \\[1ex]
+\hline \\[.1ex]
+Markov & Markov \\[3ex]
+Infections indep. given $X^t$ & Infections indep. given $X^t$ \\[3ex]
+$\mathbb{P}(j \in X^{t+1} \mid X^t) = X^t \cdot \theta_j$ & $\mathbb{P}(j \in X^{t+1} \mid X^t) = 1 - \exp(X^t \cdot \theta_j)$ \\[3ex]
+Always susceptible & Susceptible until infected \\
+\includegraphics[scale=.4]{../images/voter_model.pdf} & \includegraphics[scale=.3]{../images/icc_model.pdf} \\
+\end{tabular}
+\end{table}
+\end{frame}
+
+\begin{frame}
+\frametitle{Generalized Linear Cascade Models}
+\begin{definition}
+{\bf Generalized Linear Cascade Model} with inverse link function $f : \mathbb{R} \rightarrow [0,1]$:
+\begin{itemize}
+\item for each \emph{susceptible} node $j$ at time $t$, $\mathbf{1}\{j \in X^{t+1}\}$ is a Bernoulli variable with parameter $f(\theta_j \cdot X^t)$
+\end{itemize}
+\end{definition}
+\begin{itemize}
+\item Voter model: $f(z) = z$; \quad IC model: $f(z) = 1 - e^{z}$
+\end{itemize}
+\end{frame}
+
+\begin{frame}
+\frametitle{Sparse Recovery}
+\begin{figure}
+\includegraphics[scale=.6]{../images/sparse_recovery_illustration.pdf}
+\caption{$f(X \cdot \theta) = b$}
+\end{figure}
+\end{frame}
+
+\begin{frame}
+\frametitle{$\ell_1$-penalized Maximum Likelihood}
+\begin{itemize}
+\item Decomposable node by node
+\item Sum over ${\cal T}_i$, the steps at which node $i$ is susceptible
+\end{itemize}
+
+\begin{block}{Likelihood function}
+\begin{equation}
+\nonumber
+{\cal L}(\theta_i \mid x^1, \dots, x^n) = \frac{1}{|{\cal T}_i|} \sum_{t \in {\cal T}_i} x^{t+1}_i \log f(\theta_i \cdot x^t) + (1 - x^{t+1}_i) \log(1 - f(\theta_i \cdot x^t))
+\end{equation}
+\end{block}
+
+\begin{block}{Algorithm}
+\begin{equation}
+\nonumber
+\hat \theta_i \in \arg \max_{\theta_i} {\cal L}(\theta_i \mid x^1, \dots, x^n) - \lambda \|\theta_i\|_1
+\end{equation}
+\end{block}
+\end{frame}
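+
+% Specialization sketch (illustration only): plugging the IC inverse link
+% f(z) = 1 - e^z from the earlier slide into the likelihood above, using
+% log(1 - f(z)) = z.
+\begin{frame}
+\frametitle{Example: the Likelihood under the IC Model}
+With $f(z) = 1 - e^{z}$, so that $\log(1 - f(\theta_i \cdot x^t)) = \theta_i \cdot x^t$:
+\begin{equation}
+\nonumber
+{\cal L}(\theta_i \mid x^1, \dots, x^n) = \frac{1}{|{\cal T}_i|} \sum_{t \in {\cal T}_i} x^{t+1}_i \log\left(1 - e^{\theta_i \cdot x^t}\right) + (1 - x^{t+1}_i)\, \theta_i \cdot x^t
+\end{equation}
+\begin{itemize}
+\item Concave in $\theta_i$, so the $\ell_1$-penalized maximization is a convex program
+\end{itemize}
+\end{frame}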
+
+\begin{frame}
+\frametitle{Main Result}
+\begin{theorem}
+Assume a restricted eigenvalue condition on the Hessian and certain regularity properties of $f$. Then there exists $C > 0$, depending only on the properties of ${\cal G}$, such that with high probability:
+$$\| \theta^*_i - \hat \theta_i \|_2 \leq C \sqrt{\frac{s \log m}{n}}$$
+\end{theorem}
+
+\begin{corollary}
+By thresholding $\hat \theta_i$, if $n > C' s \log m$, we recover the support of $\theta^*_i$ and therefore the incoming edges of node $i$ in ${\cal G}$
+\end{corollary}
+\end{frame}
+
+\begin{frame}
+\frametitle{Approximate Sparsity}
+\begin{itemize}
+\item $\theta^*_{\lceil s \rceil}$: best $s$-sparse approximation of $\theta^*$
+\item $\|\theta^* - \theta^*_{\lceil s \rceil}\|_1$: `tail' of $\theta^*$
+\end{itemize}
+\begin{theorem}
+Assume a restricted eigenvalue condition on the Hessian and certain regularity properties of $f$. Then there exist $C_1, C_2 > 0$, depending only on the properties of ${\cal G}$, such that with high probability:
+\begin{equation}
+\nonumber
+\|\hat \theta_i - \theta^*_i\|_2 \leq C_1 \sqrt{\frac{s \log m}{n}} + C_2 \sqrt[4]{\frac{s \log m}{n}}\, \|\theta^* - \theta^*_{\lceil s \rceil}\|_1
+\end{equation}
+\end{theorem}
+\end{frame}
+
+\begin{frame}
+\frametitle{Lower Bound}
+\begin{itemize}
+\item Under a correlation-decay assumption for the IC model, $\Omega(s \log (m/s))$ cascades are necessary for graph reconstruction (Netrapalli and Sanghavi, SIGMETRICS'12)
+\item Adapting (Price \& Woodruff, STOC'12): in the approximately sparse case, any algorithm for any generalized linear cascade model satisfying
+$$\|\hat \theta - \theta^*\|_2 \leq C \|\theta^* - \theta^*_{\lceil s \rceil}\|_2$$
+requires $\Omega(s \log (m/s) / \log C)$ measurements.
+\end{itemize}
+\end{frame}
+
+\begin{frame}
+\frametitle{(RE) Assumptions}
+\begin{block}{Assumption on the Hessian}
+\begin{itemize}
+\item The Hessian must satisfy a `restricted eigenvalue' property, i.e.\ its smallest eigenvalue restricted to sparse vectors is bounded away from $0$.
+\end{itemize}
+\end{block}
+
+\begin{block}{From Hessian to Gram Matrix}
+\begin{itemize}
+\item $\mathbb{E}[X X^T]$: `expected' Gram matrix of the observations
+\item $\mathbb{E}[X X^T]_{i,i}$: probability that node $i$ is infected
+\item $\mathbb{E}[X X^T]_{i,j}$: probability that nodes $i$ and $j$ are infected simultaneously
+\end{itemize}
+\end{block}
+\end{frame}
+
+\begin{frame}
+\frametitle{Future Work}
+
+\begin{block}{Linear Threshold Model}
+\begin{itemize}
+\item The linear threshold model is a generalized linear cascade with a non-differentiable inverse link function (see the smoothing sketch on the next slide):
+$$\mathbb{P}(j \in X^{t+1} \mid X^t) = \mathbf{1}\{\theta_j \cdot X^t \geq \tau_j\}$$
+\end{itemize}
+\end{block}
+
+\begin{block}{Noisy Influence Maximization}
+\end{block}
+
+\begin{block}{Confidence Intervals}
+\end{block}
+
+\begin{block}{Active Learning}
+\end{block}
+\end{frame}
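+
+% A possible smoothing of the threshold link above (an illustrative
+% sketch, not a result from this work; the logistic slope beta is a
+% hypothetical smoothing parameter):
+\begin{frame}
+\frametitle{Linear Threshold Model: a Smoothing Sketch}
+\begin{itemize}
+\item Replace the hard threshold with a logistic link of slope $\beta > 0$:
+$$f_\beta(\theta_j \cdot X^t) = \frac{1}{1 + e^{-\beta (\theta_j \cdot X^t - \tau_j)}}$$
+\item $f_\beta$ is differentiable, and $f_\beta \rightarrow \mathbf{1}\{\theta_j \cdot X^t \geq \tau_j\}$ pointwise (away from the threshold) as $\beta \rightarrow \infty$
+\end{itemize}
+\end{frame}
+
+\end{document}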
