1 files changed, 113 insertions, 34 deletions
diff --git a/notes/presentation/beamer_2.tex b/notes/presentation/beamer_2.tex
index fda5b8d..40dc769 100644
--- a/notes/presentation/beamer_2.tex
+++ b/notes/presentation/beamer_2.tex
@@ -1,6 +1,6 @@
 \documentclass[10pt]{beamer}
 
-\usepackage{amssymb, amsmath, graphicx, amsfonts, color, amsthm}
+\usepackage{amssymb, amsmath, graphicx, amsfonts, color, amsthm, wasysym}
 
 \newtheorem{proposition}{Proposition}
 
@@ -56,21 +56,43 @@ What do we know? What do we want to know?
 \frametitle{Independent Cascade Model}
 
 \begin{figure}
-\includegraphics[scale=.5]{figures/weighted_graph.png}
+\includegraphics[scale=.3]{figures/weighted_graph.png}
 \caption{Weighted, directed graph}
 \end{figure}
 
 \begin{itemize}
-\item Three states: susceptible, {\color{blue} infected}, {\color{red} dead}
-\item Each {\color{blue} infected} node $i$ has a probability $p_{i,j}$ of infecting each of his neighbors $j$.
+\item At $t=0$, nodes are in three possible states: susceptible, {\color{blue} infected}, {\color{red} dead}
+\item Each {\color{blue} infected} node $i$ has a ``one-shot'' probability $p_{i,j}$ of infecting each of his susceptible neighbors $j$ at $t+1$.
 \item A node stays {\color{blue}  infected} for one round, then it {\color{red} dies}
 \item At $t=0$, each node is {\color{blue} infected} with probability $p_{\text{init}}$
+\item Process continues until random time $T$ when no more nodes can become infected. 
+\item $X_t$: set of {\color{blue}  infected} nodes at time $t$
+\item A {\bf cascade} is an instance of the ICC model: $(X_t)_{t=0, t=T}$
 \end{itemize}
 
 %Notes: Revisit the celebrated independent cascade model -> Influence maximisation is tractable, requires knowledge of weights
 
 \end{frame}
 
+%%%%%%%%%%%%%%%%%%%%%%%%%
+
+\begin{frame}
+\frametitle{Independent Cascade Model}
+
+\begin{figure}
+\includegraphics[scale=.5]{figures/weighted_graph.png}
+\caption{Weighted, directed graph}
+\end{figure}
+
+\begin{block}{Example}
+\begin{itemize}
+\item At $t=0$, the {\color{orange} orange} node is infected, and the two other nodes are susceptible. $X_0 = $({\color{orange} orange})
+\item At $t=1$, the {\color{orange}} node infects the {\color{blue} blue} node and fails to infect the {\color{green} green} node. The {\color{orange} orange} node dies. $X_1 = $({\color{blue} blue})
+\item At $t=2$, {\color{blue} blue} dies. $X_3 = \emptyset$
+\end{itemize}
+\end{block}
+
+\end{frame}
 
 %%%%%%%%%%%%%%%%%%%%%%%%%
 
@@ -83,10 +105,8 @@ What do we know? What do we want to know?
 \end{figure}
 
 \begin{itemize}
-\item If $3$ and $4$ are {\color{blue} infected} at $t$, what is the probability that node $0$ is infected at $t+1$?
+\item If $3$ and $4$ are {\color{blue} infected} at $t=0$, what is the probability that node $0$ is infected at $t=1$?
 $$1 - \mathbb{P}(\text{not infected}) = 1 - (1 - .45)(1-.04)$$
-\item In general, $X_t$ {\color{blue} infected} nodes at t:
-$$\mathbb{P}(j \text{ becomes infected at t+1}|X_{t}) = 1 - \prod_{i \in {\cal N}(j) \cap X_{t}} (1 - p_{i,j})$$
 \end{itemize}
 
 
@@ -96,36 +116,57 @@ $$\mathbb{P}(j \text{ becomes infected at t+1}|X_{t}) = 1 - \prod_{i \in {\cal N
 
 \begin{frame}
 \frametitle{Independent Cascade Model}
-\begin{proposition}
-The ICC, conditioned on previous time step, can be cast as a {\bf generalized linear model}
-$$\mathbb{P}(j \in X_{t+1} | X_t) = f(X_t \cdot \theta_j)$$
-\end{proposition}
+\begin{figure}
+\includegraphics[scale=.5]{figures/weighted_graph.png}
+\caption{Weighted, directed graph}
+\end{figure}
+
+\begin{itemize}
+\item In general, for each susceptible node $j$:
+$$\mathbb{P}(j \text{ becomes infected at t+1}|X_{t}) = 1 - \prod_{i \in {\cal N}(j) \cap X_{t}} (1 - p_{i,j})$$
+\end{itemize}
 
-\begin{proof}
-\begin{align}
+\end{frame}
+
+
+%%%%%%%%%%%%%%%%%%%%%%%%%
+
+\begin{frame}
+\frametitle{Independent Cascade Model}
+For each susceptible node $j$, the event that it becomes {\color{blue} infected} conditioned on previous time step is a Bernoulli:
+$$(j \in X_{t+1} | X_t) \sim {\cal B} \big(f(X_t \cdot \theta_j) \big)$$
+\begin{itemize}
+\item $\theta_{i,j} := \log(1 - p_{i,j})$
+\item $\theta_j := (0, 0, 0, \theta_{4,j}, 0 \dots, \theta_{k,j}, \dots)$
+\item $f : x \mapsto 1 - e^x$
+\begin{align*}
 \mathbb{P}(j\in X_{t+1}|X_{t}) & = 1 - \prod_{i \in {\cal N}(j) \cap X_{t}} (1 - p_{i,j}) \\
 & = 1 - \exp \left[ \sum_{i \in {\cal N}(j) \cap X_{t}} \log(1 - p_{i,j}) \right] \\
 & = 1 - \exp \left[ X_{t} \cdot \theta_{j}\right]
-\end{align}
-\end{proof}
-
+\end{align*}
+\end{itemize}
 \end{frame}
 
+
+
 %%%%%%%%%%%%%%%%%%%%%%%%%
 
 \begin{frame}
 \frametitle{Independent Cascade Model}
+For each susceptible node $j$, the event that it becomes {\color{blue} infected} conditioned on previous time step is a Bernoulli:
+$$(j \in X_{t+1} | X_t) \sim {\cal B} \big(f(X_t \cdot \theta_j) \big)$$
+
 \begin{block}{Decomposability}
 \begin{itemize}
-\item Conditioned on $X_t$, the state of each node is sampled independently
-\item We can focus on learning vector $\theta_{j}$ for each node
+\item Conditioned on $X_t$, the state of node $j$ is sampled independently from node $j+1$
+\item We can learn the parents of each node independently
 \end{itemize}
 \end{block}
 
 \begin{block}{Sparsity}
 \begin{itemize}
-\item $\theta_{i,j} = 0 \Leftrightarrow p_{i,j} = 0$
-\item If graph is ``sparse'', then $\theta_j$ is sparse.
+\item $\theta_{i,j} = 0 \Leftrightarrow \log(1 - p_{i,j}) = 0 \Leftrightarrow p_{i,j} = 0$
+\item If graph is ``sparse'', then $p_{j}$ is sparse, then $\theta_j$ is sparse.
 \end{itemize}
 \end{block}
 \end{frame}
@@ -137,20 +178,31 @@ $$\mathbb{P}(j \in X_{t+1} | X_t) = f(X_t \cdot \theta_j)$$
 \begin{frame}
 \frametitle{Sparse Recovery}
 \begin{figure}
+\includegraphics[scale=.6]{../images/sparse_recovery_illustration_copy.pdf}
+\caption{$\mathbb{P}(j \in X_{t+1}| X_t) = f(X_t\cdot \theta)$}
+\end{figure}
+\end{frame}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%
+
+\begin{frame}
+\frametitle{Sparse Recovery}
+\begin{figure}
 \includegraphics[scale=.6]{../images/sparse_recovery_illustration.pdf}
-\caption{$f(X_t\cdot \theta) = \mathbb{P}(j \in X_{t+1}| X_t)$}
+\caption{$\mathbb{P}(j \in X_{t+1}| X_t) = f(X_t\cdot \theta)$}
 \end{figure}
 \end{frame}
 
+
 %%%%%%%%%%%%%%%%%%%%%%%
 
 \begin{frame}
 \frametitle{Learning from Diffusion Processes}
 \begin{block}{Problem Statement}
 \begin{itemize}
-\item We are given a graph ${\cal G}$, and a diffusion process parameterized by $\left((\theta_{i,j})_{i,j}, f, p_{\text{init}}\right)$. 
+\item We are given a graph ${\cal G}$, and a diffusion process $f$ parameterized by $\left((\theta_j)_j, p_{\text{init}}\right)$. 
 \item Suppose we {\bf only} observe $(X_t)$ from the diffusion process.
-\item Under what conditions can we learn $\theta_{i,j}$ for all $(i,j)$? How many $(X_t)$ are necessary?
+\item Under what conditions can we learn $\theta_{i,j}$ for all $(i,j)$?
 \end{itemize}
 \end{block}
 \end{frame}
@@ -160,13 +212,20 @@ $$\mathbb{P}(j \in X_{t+1} | X_t) = f(X_t \cdot \theta_j)$$
 \begin{frame}
 \frametitle{Learning from Diffusion Processes}
 
-\begin{figure}
-\includegraphics[scale=.4]{../images/sparse_recovery_illustration.pdf}
-\caption{Generalized Linear Model for node $i$}
-\end{figure}
+% \begin{figure}
+% \includegraphics[scale=.4]{../images/sparse_recovery_illustration.pdf}
+% \caption{Generalized Cascade Model for node $i$}
+% \end{figure}
 
 \begin{block}{Likelihood Function}
-$${\cal L}(\theta| X_1, \dots X_N) = \frac{1}{{\cal T}_i}  \sum_{t \in {\cal T}_i} x^{t+1}_i \log f(\theta_i \cdot x^t) + (1 - x^{t+1}_i) \log(1 - f(\theta_i \cdot x^t))$$
+\begin{align*}
+{\cal L}(\theta_1, \dots, \theta_m| X_1, \dots  X_n) = \sum_{i=1}^m \sum_{t} & X_{t+1}^i \log f(\theta_i \cdot X_t) + \\
+& (1 - X_{t+1}^i) \log(1 - f(\theta_i \cdot X_t))
+\end{align*}
+\end{block}
+
+\begin{block}{MLE}
+For each node $i$, $$\theta_i \in \arg \max {\cal {L}}_i(\theta_i | X_1, X_2, \dots, X_n) - \lambda \|\theta_i\|_1$$
 \end{block}
 
 \end{frame}
@@ -178,7 +237,7 @@ $${\cal L}(\theta| X_1, \dots X_N) = \frac{1}{{\cal T}_i}  \sum_{t \in {\cal T}_
 \begin{block}{On $f$}
 \begin{itemize}
 \item $\log f$ and $\log (1-f)$ have to be concave
-\item $\log f$ and $\log (1-f)$ have bounded gradient
+\item $\log f$ and $\log (1-f)$ have to have bounded gradient
 \end{itemize}
 \end{block}
 
@@ -199,9 +258,9 @@ $${\cal L}(\theta| X_1, \dots X_N) = \frac{1}{{\cal T}_i}  \sum_{t \in {\cal T}_
 
 \begin{definition}
 Let $S$ be the set of parents of node $i$.
-$${\cal C} := \{ \Delta : \|\Delta\|_2 = 1, \|\Delta_{\bar S}\| \leq 3 \| \Delta_S\|_1 \}$$
+$${\cal C} := \{ \Delta : \|\Delta\|_2 = 1, \|\Delta_{\bar S}\|_1 \leq 3 \| \Delta_S\|_1 \}$$
 ${\cal H}$ verifies the $(S, \gamma)$-RE condition if:
-$$\forall X \in {\cal C}, \Delta {\cal H} \Delta \geq \gamma$$
+$$\forall \Delta \in {\cal C}, \Delta {\cal H} \Delta \geq \gamma$$
 \end{definition}
 \end{frame}
 %%%%%%%%%%%%%%%%%%%%%%%%
@@ -214,6 +273,7 @@ Adapting a result from \cite{Negahban:2009}, we have the following theorem:
 Assume 
 \begin{itemize}
 \item the Hessian verifies the $(S,\gamma)$-RE condition
+\item $f$ and $1-f$ are log-concave
 \item $|(\log f)'| < \frac{1}{\alpha}$ and $|(\log 1- f)'| < \frac{1}{\alpha}$ 
 \end{itemize} then with high probability:
 $$\| \theta^*_i - \hat \theta_i \|_2 \leq \frac{6}{\gamma}\sqrt{\frac{s\log m}{\alpha n}}$$
@@ -228,9 +288,28 @@ By thresholding $\hat \theta_i$, if $n > C' s \log m$, we recover the support of
 %%%%%%%%%%%%%%%%%%%%%%%%
 
 \begin{frame}
-Comments: Correlated Measurements
-TODO: still need to mention somewhere that we are doing penalized log likelihood
-condition on $X_t$ is not great, we would like to have condition on the parameters $\theta, p_{\text{init}}$ $->$ Slides about expected hessian
+\frametitle{Main result}
+
+\begin{block}{Correlation}
+\begin{itemize}
+\item Measurements are correlated, which is unusual and good for us.
+\item Independent measurements $\implies$ taking one measurement per cascade.
+\end{itemize}
+\end{block}
+
+\begin{block}{Statement w.r.t observations and not the model}
+\begin{itemize}
+\item The Hessian must verify the $(S,\gamma)$-RE condition \frownie
+\item Can we make a conditional statement on $\theta$ and not $X_t$?
+\end{itemize}
+\end{block}
+
+\end{frame}
+
+%%%%%%%%%%%%%%%%%%%%%%%
+
+\begin{frame}
+Slides about expected hessian
 TODO: slide about matrice de gram!
 \end{frame}