From 843f75943d25f4e180493142b6da0968621b9a78 Mon Sep 17 00:00:00 2001
From: Thibaut Horel <thibaut.horel@gmail.com>
Date: Mon, 9 Mar 2015 13:50:20 -0400
Subject: Big reorganisation of the repo

---
 old_work/defs.tex                        |  17 ++
 old_work/formalisation.pdf               | Bin 0 -> 312600 bytes
 old_work/formalisation.tex               | 317 +++++++++++++++++++++++++++
 old_work/maximum_likelihood_approach.tex | 230 ++++++++++++++++++++
 old_work/reportYaron.tex                 | 354 +++++++++++++++++++++++++++++++
 old_work/sparse.bib                      | 105 +++++++++
 6 files changed, 1023 insertions(+)
 create mode 100644 old_work/defs.tex
 create mode 100644 old_work/formalisation.pdf
 create mode 100644 old_work/formalisation.tex
 create mode 100644 old_work/maximum_likelihood_approach.tex
 create mode 100644 old_work/reportYaron.tex
 create mode 100644 old_work/sparse.bib

(limited to 'old_work')

diff --git a/old_work/defs.tex b/old_work/defs.tex
new file mode 100644
index 0000000..ca5dd55
--- /dev/null
+++ b/old_work/defs.tex
@@ -0,0 +1,17 @@
+\newtheorem{theorem}{Theorem}[section]
+\newtheorem{lemma}[theorem]{Lemma}
+\newtheorem{proposition}[theorem]{Proposition}
+\newtheorem{corollary}[theorem]{Corollary}
+\theoremstyle{definition}
+\newtheorem*{remark}{Remark}
+
+\DeclareMathOperator{\E}{\mathbb{E}}
+\let\P\relax
+\DeclareMathOperator{\P}{\mathbb{P}}
+\newcommand{\ex}[1]{\E\left[#1\right]}
+\newcommand{\prob}[1]{\P\left[#1\right]}
+\newcommand{\inprod}[1]{\left\langle #1 \right\rangle}
+\newcommand{\R}{\mathbb{R}}
+\newcommand{\N}{\mathbb{N}}
+\newcommand{\eps}{\varepsilon}
+\newcommand{\eqdef}{\mathbin{\stackrel{\rm def}{=}}}
diff --git a/old_work/formalisation.pdf b/old_work/formalisation.pdf
new file mode 100644
index 0000000..760abe0
Binary files /dev/null and b/old_work/formalisation.pdf differ
diff --git a/old_work/formalisation.tex b/old_work/formalisation.tex
new file mode 100644
index 0000000..c2ba8c0
--- /dev/null
+++ b/old_work/formalisation.tex
@@ -0,0 +1,317 @@
+\documentclass[10pt]{article}
+\usepackage[utf8x]{inputenc}
+\usepackage{fullpage}
+\usepackage{amsmath, amssymb, bbm, amsthm}
+\usepackage[pagebackref=true,breaklinks=true,colorlinks=true,backref=false]{hyperref}
+\usepackage[capitalize, noabbrev]{cleveref}
+\usepackage{microtype}
+\input{defs}
+
+\date{}
+\title{Cracking Cascades: Applying Sparse Recovery to Graph Reconstruction}
+\author{Jean Pouget-Abadie, Thibaut Horel}
+
+\begin{document}
+\maketitle
+
+\section{Preliminaries: What is a measurement?}
+
+There is an unknown graph $G=(V,E)$ that we wish to reconstruct by observing
+information cascades. An information cascade is a random discrete-time process
+where we get to observe the set of vertices infected at each time step.
+
+The key contribution is to formulate this problem as a sparse recovery problem
+akin to those found in the compressed sensing literature. The parallel between
+the two problems is as follows:
+\begin{itemize}
+    \item the unknown \emph{signal} $s$ is a vector in $\{0,1\}^{V\times V}$
+        with $s_{u,v} = 1$ iff. $(u,v)\in E$.
+    \item the \emph{sensing matrix} is given by the cascades. The \emph{sensing
+        matrix} maps the signal to the observation space. Since the cascades
+        are random time processes, they define a sequence of random
+        observations.
+    \item the \emph{observations} are the set of vertices infected at each time
+        step in each cascade.
+\end{itemize}
+
+Let us define $n\eqdef |V|$. If we assume that all the vertices in $G$ have
+a degree bounded by $d\in\N_+$, then the signal $s$ is $nd$-sparse. Assuming
+that the sensing matrix has good random properties, then we can hope to
+recover $s$ with $O(nd\log n)$ random observations.
+
+Two main goals:
+\begin{itemize}
+    \item understand to which extent the probabilistic cascade models translate
+        into incoherence properties of the sensing matrix and allow for an
+        accurate reconstruction of the unknown graph with a small number of
+        observations.
+    \item compare the empirical performance of this compressed sensing approach
+        to prior graph reconstruction approaches.
+\end{itemize}
+
+\begin{remark}
+\begin{itemize}
+    \item It seems that we will need a \emph{complete} probabilistic model for
+        the cascades. By complete I mean that we need to specify a joint
+        distribution on the source of the cascade (\emph{e.g.} the source is
+        chosen uniformly at random) and on the cascade diffusion process. Such
+        a model is probably not very realistic, but is necessary to obtain
+        theoretical guarantees. The unrealism of the model is not a problem
+        \emph{per se}; the eventual validity of this approach will be confirmed
+        experimentally anyway.
+    \item It seems likely that we will want the rows of the sensing matrix to
+        be independent realizations of the same distribution. However,
+        \textbf{this will never be the case} if we have one row for each time
+        step of each cascade: the rows within the same cascade are obviously
+        highly correlated. We might have to \emph{aggregate} them (for example
+        through concatenation?) in a sensible way to have with one row per
+        cascade.
+    \item Something which makes this problem very different from the usual
+        compressed sensing problems is that the measurements depend on the
+        signal we wish to recover: cascades propagate along the edges contained
+        in the signal. More precisely, not only the rows of the sensing matrix
+        are correlated (if we do not aggregate them by cascade) but also they
+        are correlated through the signal. Is this a problem? I don't know,
+        this looks scary though.
+    \item A slightly more general model which we will need in the independent
+        cascade case considers a signal $s\in[0,1]^{V\times V}$. We can always
+        go from this model to the binary case by first recovering the
+        non-binary $s$ and then applying a thresholding procedure. But if we
+        only care about recovering the support of $s$, can we do better than
+        this two-step process? (TO DO: read stuff about support recovery).
+\end{itemize}
+\end{remark}
+
+Previous stuff:
+\begin{itemize}
+\item Introduction
+\item Simple graphs (cycle, star, path, tree?)
+\item Isotropic condition
+\item RIP condition
+\end{itemize}
+
+One last property we are going to look for in measurements is the {\bf Restriced Isometry Property} (RIP), which was introduced in \cite{???} and is ubiquitous to the compressed sensing literature. Suppose we are given a measurement matrix $M$. For every $T \in \mathbb{N}$, we define $\delta_T$ to be the smallest quantity such that, for any submatrix $M_T$ of $M$ obtained by extracting $T$ columns of $M$, and for any vector $c$ such that $\| c\|_2 = 1$:
+
+$$1 - \delta_T \leq \| M_T c \|_2^2 \leq 1 + \delta_T$$
+
+For small $\delta_T$, the above equation defines a `loose' orthonormality property for the columns of $M$.
+
+\section{Warm up: the voter model}
+
+
+In the voter model, there are two types of nodes, {\it red} and {\it blue}. We fix a horizon $T$. At $t=0$, we decide on a set of {\it blue} nodes, called the seeding set.  At each time step $t$, each node $u$ chooses one of its neighbors uniformly, with probability $1/deg(u)$, and adopts the color of that neighbor. A cascade in the voter model is the evolution of which nodes $blue$ nodes for each time step $ 0 \leq t \leq T$.
+
+For simplicity, we suppose that every node has a probability $1/p_\text{init}$ of being a part of the seeding set of a cascade. The following analysis can easily be adapted to a non-uniform probability vector. We also suppose that the graphs includes self-loops, meaning the node has the option to keep his color for the next round. This is a common assumption which does not change any of the following results.
+
+\paragraph{Notation}
+Denote by $(M^t_k)_{1 \leq k \leq m; 1 \leq t \leq T}$ the set of blue nodes in cascade $k$ at time step $t$. In other words, $M^t_k$ is a vector in $\mathbb{R}^n$ such that:
+
+\[ (M^t_k)_j = \left\{ 
+  \begin{array}{l l}
+    1 & \quad \text{if $j$ is $blue$ in cascade $k$ at time step $t$}\\
+    0 & \quad \text{otherwise}
+  \end{array} \right.\]
+
+Consider a node $i$, and let $\theta_i$ be the vector representing the neighbors of $i$ in the graph {\cal G}. In other words, $\theta_i$ is a vector in $\mathbb{R}^n$ such that:
+
+\[ \theta_{ij} = \left\{ 
+  \begin{array}{l l}
+    1/deg(i) & \quad \text{if $j$ is a parent of $i$ in {\cal G}}\\
+    0 & \quad \text{otherwise}
+  \end{array} \right.\]
+
+Let $b^t_k$ be the vector representing the probability that in cascade $k$ at time step $t$ node $i$ stays or becomes $blue$.
+
+\[ b^t_k = \left\{ 
+  \begin{array}{l l}
+    (\text{Number of $blue$ neighbors in $(k, t-1$})/deg(i) & \quad \text{if $t\neq 0$}\\
+    1/p_{\text{init}} & \quad \text{otherwise}
+  \end{array} \right.\]
+
+Our objective is to find the neighbors of $i$ in {\cal G} from $(M^t_k)$ i.e recover the vector $\theta_i$. Notice that for each $(k,t)$, we have:
+
+$$\forall t< T-1, k, \ M^t_k \cdot \theta_i = b^{t+1}_k$$
+
+We can concatenate each equality in matrix form $M$, such that the rows of $M$ are the observed $blue$ nodes $M^t_k$ and the entries of $\vec b$ are the corresponding probabilities:
+
+$$M \cdot \theta_i = \vec b$$
+
+Note that if $M$ had full column rank, then we could recover $\theta_i$ from $\vec b$. This is however an unreasonable assumption, even after having observed many cascades. We must explore which assumptions on $M$ allow us to recover $\theta_i$ from a small number of cascades. Further note that we do not immediately observe $\vec b$. Instead, we observe a bernoulli vector, whose j$^{th}$ entry is equal to $1$ with probability $b_j$ and $0$ otherwise. We will denote this vector $\vec w$, and denote by $\vec \epsilon$ the quantum noise vector such that $\vec w = \vec b + \vec \epsilon$. We can therefore reformulate the problem with our observables:
+
+$$M \cdot \theta_i + \vec \epsilon = \vec w$$
+
+We hope to show that we can exploit the sparsity of vector $\theta_i$ to apply common sparse recovery techniques.
+
+\section{Independent Cascade Model}
+
+\subsection{The Model}
+
+Recall the independent cascade model, where nodes have 3 states: susceptible,
+active, and inactive. All nodes start either as susceptible or active. At each
+time step $t$, for every (active, susceptible)-pair $(j,i)$, $j$ attempts to
+infect $i$ with probability $p_{j,i}$ and become inactive. If $j$ succeeds, $i$
+will become active at time step $t+1$. The cascade continues until no active
+nodes remain.
+
+Consider a node $i$. If $p_{j,i}$ is the probability that node $j$ infects $i$
+at the next step, let $\theta_{i,j} := \log (1 - p_{j,i})$. Note that
+$\theta_{j,i} = 0 \Leftrightarrow p_{j,i} = 0$. Also note that:
+
+$$\sum_{j\in S} \theta_{j,i} = \log \prod_{j \in S} (1 - p_{j,i}) = \log \mathbb{P}(\text{i not infected by any j} \in S)$$
+
+Our objective is to recover the vector $\theta_{*,i}$ for every node $i$.
+
+\subsection{Formulating the sparse recovery problem}
+
+The idea behind what follows is the assumption that $p_{*,i}$ and therefore $\theta_{*,i}$ are sparse vectors: a node has few neighbors (parents in the directed graph) compared to the size of the graph.
+
+Let $\mathbbm{1}_S$ be the indicator variable for the nodes that are active at
+a certain time step $t$ of a certain cascade. Under the previous assumption, we
+have:
+$$\inprod{\mathbbm{1}_S, \theta_{*,i}} = \log \mathbb{P}( \text{i not infected at } t+1| S \text{ active at } t)$$
+
+Suppose we observe a series of cascades $(A_k^t)$, where $A_k^t$ are the nodes
+active in cascade $k$ at time $t$ (these are exactly the nodes which became
+infected at time $t-1$). Recall that once a node has been infected (becomes
+active), it cannot be infected in the future (is inactive). Let us therefore
+look at all the sets $(A_k^t)_{t \leq t_i^k} $ such that for every cascade $k$,
+$t_i^k > t$, where $t_i^k = + \infty$ if $i$ was not infected in that cascade.
+If we concatenate these rows $\mathbbm{1}_{A_k^t}$ into a matrix $M$ (for
+measurement), we have:
+
+$$M \theta_{*,i} = b$$
+where $b$ is a column vector corresponding to $\log \mathbb{P}(\text{i was not infected at } t+1| S \text{ active at } t)$. Note that even if we had access to $b$ directly but $M$ does not have full column rank, then solving for $x$ is not trivial! However, under sparsity assumptions on $x$, we could apply results for the sparse recovery/sparse coding/compressed sensing literature 
+
+In reality, we do not have access to $b$. We have access to the realization vector $\vec w$ of $\{i$ was infected at $t+1 | S$ active at $t\}$:
+
+$$\vec w\sim B(1 - e^{M \theta_{*,i}})$$
+
+where the operation $f: \vec v \rightarrow 1 - e^{\vec v}$ is done element-wise and $B(\vec p)$ is a vector whose entries are bernoulli variables of parameter the entries of $\vec p$.
+
+\subsection{Maximum likelihood problem}
+
+The maximum likelihood problem takes the following form:
+\begin{displaymath}
+    \begin{split}
+        \max_\theta &\sum_{k,t}\left(b_{k,t}\log\left(1-\exp\inprod{A_k^t, \theta}\right)
+        + (1-b_{k,t})\log\exp\inprod{A_k^t,\theta}\right)\\
+    \text{s.t. }& \|\theta\|_1 \leq k
+    \end{split}
+\end{displaymath}
+which we can rewrite as:
+\begin{displaymath}
+    \begin{split}
+        \max_\theta &\sum_{k,t}b_{k,t}\log\left(\exp\left(-\inprod{A_k^t, \theta}\right)-1\right)
+        + \sum_{k,t}\inprod{A_k^t,\theta}\\
+    \text{s.t. }& \|\theta\|_1 \leq k
+    \end{split}
+\end{displaymath}
+In this form it is easy to see that this is a convex program: the objective
+function is the sum of a linear function and a concave function.
+
+\paragraph{Questions} to check: is the program also convex in the $p_{j,i}$'s?
+What is the theory of sparse recovery for maximum likelihood problems? (TO DO:
+read about Bregman divergence, I think it could be related).
+
+\subsection{Results under strong assumptions}
+
+What can we hope to have etc. (If M has full column rank or if same sets S occur frequently).
+
+\subsection{Result under RIP condition}
+
+For ease of notation, let $\theta := \theta_{*,i}$ The following resolution of the above problem is heavily borrowed from \cite{2005math......3066C}. We can see $w$ as a bounded perturbation of $f(b)$: 
+$$w = f(M\theta) + \epsilon, \text{ where } \|\epsilon\|_{\infty} < 1$$
+
+Define the following program, which we can solve with all available information $M$ and $w$:
+\begin{equation}
+(N_1) \quad \min_\theta \|\theta\|_1 \quad \text{subject to} \quad \|e^{M\theta} - (1 - w) \|_2 \leq \|\epsilon \|_\infty \nonumber
+\end{equation}
+
+While we can hope for the columns of $M$ to be `loosely' orthogonal, we must normalize them if we hope for $\delta_T$ to be small. Let $M'$ correspond to $M$ where the columns have been normalized, we will therefore define $\epsilon'$ and $M'$, such that:
+$$w = f(M'\theta) + \epsilon'$$
+We see that $\epsilon'= w - (w - \epsilon)^{\vec \lambda}$ (element-wise operation), where $\lambda_i = 1/\sqrt{\sum \text{col}(M)_i}$ such that $\|\epsilon'\|_{\infty} < 1$. We can define the following program:
+
+\begin{equation}
+(N_2) \quad \min_\theta \|\theta\|_1 \quad \text{subject to} \quad \|e^{M'\theta} - (1 - w) \|_2 \leq \|\epsilon' \|_\infty \nonumber
+\end{equation}
+
+With the corresponding $\delta_T'$ defined for $M'$, we are going to prove the following:
+
+\begin{theorem}
+
+Suppose that all nodes in our graph have {\bf bounded degree by $S$}, that no edge between $i$ and its neighbors has a probability of $1$, such that $\forall j, p_{j,i} < 1 - \eta \ \text{ where } 0<\eta<1$, and that $\delta'_{3S} + 3 \delta'_{4S} < 2$. If $\theta_0$ is the true vector for node $i$, the solution $\hat \theta$ to $(N_2)$ obeys:
+$$\|\hat \theta - \theta_0 \|_2 \leq \frac{C_S}{\eta} \|\epsilon' \|_{\infty} $$ where $C_S$ is well behaved ($\approx 10$ for reasonable values of $\delta'_{4S}$)
+\end{theorem}
+
+
+\begin{proof}
+The proof borrows heavily from \cite{2005math......3066C}. Define $h := \theta - \theta_0$. We list the results as they are given in \cite{2005math......3066C} or as they can be adapted in our case.
+
+\begin{itemize}
+\item $\|e^{M'\theta} - e^{M' \theta_0} \|_2 \leq \| e^{M'\theta} - ( 1 - w) \|_2 + \| e^{M'\theta} - (1 - w) \|_2 \leq 2 \|\epsilon' \|_{\infty} $
+\item $\|h\|_2^2 \leq (1 + |T_0|/N) \|h_{T_{01}}\|_2^2$, where N is an arbitrary positive integer, and $h_{T_{01}}$ is the vector $h$, where all entries are masked to 0, except those corresponding to the non-zero entries of $\theta_0$ and the $N$ largest entries of $h_{T_0^c}$.
+\item $\|e^{M'h} \|_2 \geq |\eta| \|M'h\|_2   \geq |\eta| C_{T_0, N} \cdot \|h_{T_{0,1}} \|_2$, where $C_{T_0,N} := \sqrt{1 - \delta'_{N + T_0}}  - \sqrt{T_0 / N} \sqrt{1 + \delta'_N}$
+\end{itemize}
+The first bullet point is obtained by the triangle inequality. The second bullet point is identical to the proof in \cite{2005math......3066C}. The first inequality in the third bullet-point is obtained because by assumption all entries of $M'\theta$ are greater than $\ln \eta$ (remember that the $e^{\vec v}$ operation is done element-wise). The rest follows like in \cite{2005math......3066C}, and we obtain that:
+$$\|h\|_2 \leq 2 \frac{\sqrt{1 + T_0/N}}{ \eta \cdot C_{T_0, N}} \|\epsilon'\|_{\infty} = \frac{C_S}{\eta} \|\epsilon'\|_{\infty}$$
+For $\delta'_{4S} \leq \frac{1}{5}, C_S \approx 8.82$, for $\delta'_{4S} \leq \frac{1}{4}, C_S \approx 10.47$
+\end{proof}
+
+\begin{remark} A few notes on the result:
+\begin{itemize}
+\item With this naive reduction of $w$ as an $e-$pertubation to $f(b)$, the best $\|\epsilon\|_{\infty}$ we can hope for is $1 - \eta$. Seeing $w$ as more than an arbitrary perturbation will most likely yield a much better upper-bound.
+\item  The upper-bound is done in $\log$-space for the true probabilities:
+$$\frac{C_S}{\eta} > \sum \left[ \log \left(\frac{1 - p_j}{1 - p_j'} \right) \right]^2 > \frac{\eta \log(1/\eta)}{1 - \eta} \| \frac{1 - p_j}{1 - p'_j} - 1 \|  $$
+
+This bullet-point needs to be finished
+\item To what extent can we approximate our measurement matrix to a matrix with independent bernoulli entries? What $\delta_T$'s can we hope for? Which constant $C_S$ can we hope for? How does it change with the rows of $M$?
+\end{itemize}
+
+\end{remark}
+
+
+
+\subsection{Results under isotropic condition}
+
+
+
+% Given $M$ and $w$, we can solve the following maximum log-likelihood problem:
+% \begin{equation}
+% \max_{\|\theta_{*,i}\|_0 \leq k}  <w,\log f(M \theta_{*,i}) + <1 - w, M\theta_{*,i}>
+% \label{eq:max_loglik_obj}
+% \end{equation}
+
+% Since optimizing under the $\|\theta\|_0 \leq k$ is hard, we solve instead the Lagrangian relaxation:
+
+% \begin{equation}
+% \max_{\theta_{*,i}}  <w,\log f(M \theta_{*,i}) + <1 - w, M\theta_{*,i}> + \lambda \|\theta_{*,i}\|_0 
+% \label{eq:lagra_max_loglik_obj}
+% \end{equation}
+
+
+% Eq~\ref{eq:lagra_max_loglik_obj} is concave in $\theta_{*,i}$ and can therefore be solved efficiently. We would like to exploit the sparsity of $\theta_{*,i}$ to show that we recover the true sparse $\theta_{*,i}$ with few measurements (rows of M).
+
+
+\bibliography{sparse}{}
+\bibliographystyle{plain}
+
+\end{document}
+
+
+
+
+% Let us now consider the matrix M, whose rows are the vectors $\mathbbm{1}_S$. If we could estimate the vector $X := \mathbb{P}(i \text{ not infected at } t | S \text{ active at } t)$, we could solve for $\theta$:
+% $$M \theta_{*i} = X, \text{ where }\theta_{*,i} \text{ is a sparse vector}$$
+
+% There are many powerful tools from sparse coding theory which would allow us to recover the sparsest $\theta$ under certain assumptions on $M$. One such assumption is incoherence of the columns of $M$. In other words, if we can show that for all columns $M_i, M_j$ of M, we have $|<M_i,M_s>| \leq \mu \|M_i\| \|M_j\|$, i.e M is $\mu-incoherent$, then for any node $i$ with fewer than $k = \frac{1}{2 \mu}$ parents, we can recover $\theta_{*i}$ exactly. To get a better intuition of what this means, if the entries of $M$ were independent gaussian variables, we would need approximately $k \ln(\frac{n}{k})$ measurements to recover $\theta_{*i}$ exactly. 
+
+% This would seem to suggest that the quality of observed cascades is much more important than quantity--a notion which has not truly been explored in graph reconstruction as of yet. Another stronger assumption on the columns of M is separability (see Moitra's work in Topic Modeling) leading to even stronger results.
+
+% In reality of course, we do not know $X$. Instead, we have access to $\hat X$ which estimates $X$ in the following way. For any set S in any cascade that was active before i was, we obtain a binary signal, $\hat X_S:=0$ if $i$ was infected immediately afterwards and $\hat X_S:=1$ otherwise. As the number of times set S was observed active, we have $$\frac{1}{m} \sum \hat X_S \rightarrow \mathbb{P}(i \text{ not infected at }t+1 | S \text{ active at } t)$$.
+
+% Certain sets might have been infected many times in our set of cascades, certain sets might have been infected only once. One possible way to rewrite the program above is: $$\hat X_j = H(w_j - e^{(M \theta_{*i})_j}), \text{ where } w\sim {\cal U}[0,1] \text{ and H is the step function } \mathbbm{1}_{\cdot \geq 0}$$
+
+% This problem seems to be closely related to the growing field of 1-bit compressed sensing, with at least one difference: when scientists consider 1bitCS with noise, they are usually in a position to decide which measurement to do next to recover $\theta_{*i}$, whereas in this framework, we are limited to an observer status\footnote{Research in the other direction could be very interesting: see discussion about exploration-exploitation with Thibaut}.
+
+% I need to do more reading on the subject, namely to understand what guarantees people have found for sample complexity.
+
diff --git a/old_work/maximum_likelihood_approach.tex b/old_work/maximum_likelihood_approach.tex
new file mode 100644
index 0000000..4a22158
--- /dev/null
+++ b/old_work/maximum_likelihood_approach.tex
@@ -0,0 +1,230 @@
+\documentclass[11pt]{article}
+\usepackage{fullpage, amsmath, amssymb, amsthm}
+\usepackage{color}
+\newtheorem{theorem}{Theorem}
+\newtheorem{lemma}{Lemma}
+\newtheorem{remark}{Remark}
+\newtheorem{proposition}{Proposition}
+
+\title{Maximum Likelihood Approach}
+\author{Jean Pouget-Abadie}
+
+\begin{document}
+
+\maketitle
+
+We consider the node $\alpha$. We index the measurements by $i \in [1, n]$. Let $b^i$ be the indicator variable for node $\alpha$ active at the round following measurememt $i$ and let $x^i$ be the vector of active nodes for measurement $i$. Recall that:
+
+\begin{equation}
+\label{eq:probability_of_infection}
+1 - \exp(\langle x^i, \theta \rangle) = \mathbb{P}(\text{node } \alpha \text{ is active at the following round})
+\end{equation}
+
+The likelihood problem can be formulated as such:
+
+\begin{equation}
+\label{eq:main_formulation}
+\min_{\theta \in \mathbb{R}^p} \quad \lambda_n \| \theta \|_1 + \sum^n_{i=1} - b^i \log \left(e^{-\langle x^i, \theta \rangle} - 1 \right) - \langle x^i, \theta \rangle
+\end{equation}
+
+We define $f(\theta):= \sum^n_{i=1} - b^i \log \left(\exp(-\langle x^i, \theta \rangle) \right) - \langle x^i, \theta \rangle$ such that Eq.~\ref{eq:main_formulation} can be rewritten as:
+
+\begin{equation}
+\label{eq:small_formulation}
+\min_{\theta \in \mathbb{R}^p} \quad f(\theta) + \lambda_n \| \theta \|_1
+\end{equation}
+
+We cite the following theorem from \cite{Negahban:2009} (roughly, because the statements of the theorem are either slightly wrong or unclear):
+
+\begin{proposition}
+\label{thm:cited_theorem}
+Let ${\cal C}:=\{\Delta \in \mathbb{R}^p : \exists S \subset [1, n] \ s.t. \ \|\Delta_{S^c}\|_1 \leq 3 \| \Delta_S \|_1 \}$. Suppose that $\theta^*$ is s-sparse, and the following two conditions are met: 
+\begin{equation}
+\lambda_n \geq 2 \|\nabla f(\theta^*) \|_\infty
+\label{eq:lambda_condition}
+\end{equation}
+\begin{equation}
+\forall \Delta \in {\cal C}, \ \Delta^T \cdot \nabla^2 f(\theta^*) \cdot \Delta \geq \gamma_n \| \Delta \|_2^2
+\label{eq:RSC_condition}
+\end{equation}
+then:
+\begin{equation}
+\| \theta - \theta^* \|_2 \leq \frac{\sqrt{s} \lambda_n}{\gamma_n}
+\end{equation}
+\end{proposition}
+
+It remains to show the two conditions for Proposition~\ref{thm:cited_theorem} are met.
+
+\section*{Condition~\ref{eq:lambda_condition}}
+Condition~\ref{eq:lambda_condition} requires us to find an upper-bound for $ 2 \|\nabla f(\theta^*) \|_\infty$. If we only consider the first measurement of every cascade, this can be done easily. Let $N$ be the number of cascades (to distinguish from $n$ number of total measurements). Begin by noting that:
+
+\begin{equation}
+\nabla_k f(\theta) = \sum^n_{i=1} \frac{b^i x^i_k}{1 - e^{\langle x^i, \theta \rangle}} - \sum^n_{i=1} x^i_k = \sum_{i=1}^n x^k_i \left( \frac{b^i}{\mathbb{P}(\text{node } \alpha \text { infected})} - 1\right)
+\end{equation}
+
+\begin{lemma}
+\label{lem:subgaussian_variable}
+$\nabla f(\theta^*)$ is a $N/p_{\min}$-subgaussian variable, where $p_{\min}$ is the smallest non-zero link to node $\alpha$.
+\end{lemma}
+
+\begin{proof}
+\begin{align}
+\mathbb{E} \left( \nabla_k f(\theta) \right) & = \sum_{i=1}^N \mathbb{E} \left[ x^i_k \left( \frac{b^i}{\mathbb{P}(\text{node } \alpha \text { infected})} - 1\right) \right] \nonumber \\
+& = \sum_{i=1}^N \mathbb{E}_S \left[ \mathbb{E}\left[x^i_k \left( \frac{b^i}{\mathbb{P}(\text{node } \alpha \text { infected})} - 1\right) \middle| S \right] \right] \quad \text{where S is the seed set} \nonumber \\
+& = \sum_{i=1}^N  \mathbb{E}\left[x^i_k \left( \frac{ \mathbb{E}_S \left[ b^i \middle| S \right]}{\mathbb{P}(\text{node } \alpha \text { infected})} - 1\right)  \right] \nonumber \\
+& = 0
+\end{align}
+Therefore, $\nabla f(\theta^*)$ is the sum of zero-mean variables, upper-bounded by $1/p_{\min}$. It follows that $\nabla f(\theta^*)$ is $N/p_{\min}$-subgaussian.
+\end{proof}
+
+By union bound and characterization of sub-gaussian variables:
+
+\begin{equation}
+\mathbb{P}(\| \nabla f(\theta) \|_{\infty} > \lambda) \leq 2 \exp \left( -\frac{\lambda^2 p_{\min}}{2n} + \log p \right)
+\end{equation}
+
+In conclusion, for $\delta>0$, $\lambda := 2 \sqrt{\frac{n^{\delta + 1} \log p}{p_{\min}}}$ meets Condition~\ref{eq:lambda_condition} with probability $1 - \exp(-n^\delta \log p )$
+
+
+\section*{Condition~\ref{eq:RSC_condition}}
+
+Note that:
+\begin{equation}
+\nabla_{kj} f(\theta) = \sum_{i=1}^n \frac{b^i x_k^i x_j^i e^{\langle x^i, \theta \rangle}}{\left(1 - e^{\langle x^i, \theta \rangle} \right)^2} = \sum_{i=1}^n b^i x_k^i x_j^i \frac{\mathbb{P}(\text{node } \alpha \text { not infected})}{\mathbb{P}(\text{node } \alpha \text { infected})^2}
+\end{equation}
+
+
+We are going to explicitate a constant $\gamma$ such that: $\forall \Delta \in {\cal C}, \Delta^T \cdot \nabla^2 f(\theta^*) \cdot \Delta \geq \gamma n \|\Delta\|_2^2$. 
+
+\paragraph{Notation} Let $p_i := \mathbb{P}(\text{node } \alpha \text { infected})$. Let $Z^i_k := b^i x^i_k \frac{1-p_i}{p_i^2}$and let $Z^i_{k,j} := b^i x^i_k x^i_j \frac{1-p_i}{p_i^2}$. We also define $Z_k := \sum_i Z^i_k$ and $Z_{k,j} := \sum_i Z^i_{k,j}$.
+
+\begin{align}
+\Delta^T \cdot \nabla^2 f(\theta^*) \cdot \Delta & = \sum_k \Delta_k^2 \left[ \sum_i b^i x_k^i \frac{1 - p_i}{p_i^2} \right] + 2 \sum_{k< j} \Delta_k \Delta_j \left[ \sum_i b^i x^i_k x^i_j \frac{1 - p_i}{p_i^2}\right] \nonumber \\ 
+& = \sum_k \Delta_k^2 Z_k + 2 \sum_{k< j} \Delta_k \Delta_j Z_{k,j} \nonumber
+\end{align}
+
+
+
+\begin{lemma}
+\label{lem:first_term}
+Suppose that $\forall k$, $\mathbb{E}_{S(k)} \left[ \frac{1}{p_i} \right] \geq 1 + \alpha$, then with probability greater than $1 - 2 p e^{-2 \alpha^2 (1-c)^2p_{\min}^2 p_{\text{init}}^2 N}$, $$\forall k, \ Z_k > c \alpha p_{\text{init}} N$$
+\end{lemma}
+
+\begin{proof}
+Let $S(k)$ denote the active set conditioned on the fact that node $k$ is active AND that one parent is active. We denote $p_{S(k)}$ the probability that the active set verifies the previous two conditions.
+\begin{align}
+\nonumber
+\mathbb{E}[Z^i_k] & = p_{S(k)} \mathbb{E}_{S(k)} \left[ \mathbb{E}[b^i | S(k)] \frac{1 - p_i}{p_i^2} \right] \\ \nonumber
+& = p_{S(k)} \left( \mathbb{E}_{S(k)} \left[ \frac{1}{p_i} \right] - 1 \right) \\ \nonumber
+& \geq \alpha p_{S(k)} \quad \text{by assumption}
+\end{align}
+
+Note that $|Z^i_k| < \frac{1}{p_{\text{min}}^2}$ {\it a.s.}. By Hoeffding's first inequality, for $0<c <1$,
+
+\begin{align}
+\nonumber
+\mathbb{P}\left(Z_k < c \alpha p_{\text{init}} N \right) & < 2 e^{- \frac{2(1-c)^2}{Nb^2} \left( \mathbb{E}_{S(k)} \left[ \frac{1}{p_i} \right] - 1 \right)^2} \\ \nonumber
+& < 2 e^{-2 \alpha^2 (1-c)^2p_{\min}^2 p_{S(k)}^2 N}
+\end{align}
+
+We conclude by union bound.
+\end{proof}
+
+\begin{lemma}
+Suppose that $\forall k,j$, $1 + \alpha \leq \mathbb{E}_{S(k, j)} \left[ \frac{1}{p_i} \right] \leq 1 + \beta$, then with probability greater than $1 - pe^{-2 \alpha^2 (1-c)^2p_{\min}^2 p_{S(k,j)}^2 N}$, $$\forall k,j, \ Z_{k,j} < c \beta p_{S(k,j))} N$$
+\end{lemma}
+
+\begin{proof}
+We follow the same reasoning as Lemma~\ref{lem:first_term}:
+\begin{align}
+\nonumber
+\mathbb{E}[Z^i_{k,j}] & = p_{S(k,j)} \left( \mathbb{E}_{S(k,j)} \left[ \frac{1}{p_i} \right] - 1 \right) \\ \nonumber
+& \leq \beta p_{S(k,j)} \quad \text{by assumption}
+\end{align}
+
+By Hoeffding's second inequality, for $0 < c < 1$,
+\begin{align}
+\nonumber
+\mathbb{P}\left(Z_{k,j} > c \beta p_{S(k,j)} N \right) & \leq e^{- \frac{2(1-c)^2}{Nb^2} \left( \mathbb{E}_{S(k,j)} \left[ \frac{1}{p_i} \right] - 1 \right)^2} \\ \nonumber
+& \leq e^{-2 \alpha^2 (1-c)^2p_{\min}^2 p_{S(k,j)}^2 N} 
+\end{align}
+We conclude by union bound.
+\end{proof}
+
+\begin{proposition}
+Suppose that $\forall k,j$, $1 + \alpha \leq \mathbb{E}_{S(k, j)} \left[ \frac{1}{p_i} \right] \leq 1 + \beta$, then with probability greater than $1 - XXX$, condition~\ref{eq:RSC_condition} is met with $\gamma_n = \gamma n$ where $\gamma := \alpha p_{S(k)} - 16 \sqrt{s} \beta p_{S(k,j)}$
+\end{proposition}
+
+\begin{proof}
+(Sketch) By the triangle inequality followed by MacLaurin's inequality,
+\begin{align}
+\mid \frac{2}{\binom{n}{2}} \sum_{i<j} \Delta_k \Delta_j \mid & \leq \frac{1}{n^2} \sum_k \mid \Delta_k \mid \nonumber \\
+\mid 2 \sum_{i<j} \Delta_k \Delta_j \mid & \leq \|\Delta\|_1 \leq 4 \sqrt{s} \| \Delta \|_2 \quad \text{ since } \Delta \in {\cal C} \nonumber
+\end{align}
+\end{proof}
+
+\paragraph{Hoeffding's inequality}
+
+For $t \in \mathbb{R}$ and independent variables $Z_i$ such that $|Z_i|<b$ {\it a.s.}, we have:
+\begin{align}
+\label{eq:hoeffding_inequality}
+\mathbb{P} \left( \middle| \sum_{i=1}^N Z_i - \mathbb{E}\left[\sum_{i=1}^N Z_i \right] \middle| \geq t \right) \leq 2 \exp\left(- \frac{2 t^2}{N b^2} \right) \nonumber \\
+\mathbb{P} \left(\sum_{i=1}^N Z_i \geq \mathbb{E}\left[\sum_{i=1}^N Z_i \right] + t \right) \leq \exp\left(- \frac{2 t^2}{N b^2} \right) \nonumber
+\end{align}
+
+
+% \subsection*{First term}
+
+% We are first going to find a lower-bound for $\sum_i b^i x_k^i \frac{1 - p_i}{p_i^2}$ by computing a lower-bound on its expectation and concluding with Hoeffding's inequality. If we only consider the first measurement of every cascade and we further suppose that $p_i < 1 - \eta$ no matter the configuration of active nodes (slightly less strong than correlation decay).
+
+% \begin{align}
+% \mathbb{E}\left(\sum_i b^i x_k^i \frac{1 - p_i} {p_i}^2 \right) & = \sum_i \mathbb{E} \left(x_k^i \frac{1 - p_i} {p_i}^2  \right) \nonumber \\
+% & = \sum_i \mathbb{P}(b^i =1 | x_k^i =1) \mathbb{P}(x^i_k =1) \mathbb{E}\left(\frac{1 - p_i}{p_i^2} \middle| b^i =1 = x_k^i \right) \nonumber \\
+% & \geq \sum_{i=1}^n p_{\min} \cdot \min \left(1 , 1 - (1 - p_{init})^s \right)\cdot p_{init} \cdot \frac{\eta}{(1 - \eta)^2} \nonumber \\
+% & \geq s p_{init}^2 p_{\min} \frac{\eta}{(1 - \eta)^2} \quad \text{si }s < \frac{1}{p_{init}} \nonumber \\
+% & \geq p_{init} p_{\min} \frac{\eta}{(1 - \eta)^2} \quad \text{si }s > \frac{1}{p_{init}} \nonumber
+% \end{align}
+
+% We can conclude using the following Hoeffding inequality for independent random variables bounded by $[0, b_i]$ by noticing that our variables are bounded by above by $\frac{1 - p_{\min}}{p_{\min}^2}$
+
+% \paragraph{Hoeffding's inequality}
+% \begin{equation}
+% \label{eq:hoeffding_inequality}
+% \mathbb{P} \left(\sum Z_i \geq \mathbb{E}[\sum Z_i] - t \right) \leq \exp\left(- \frac{2 N t^2}{b^2} \right)
+% \end{equation}
+
+% It follows that for $c<1$ with probability $1 - \exp \left( - n^3 c^2 s^2 p_{init}^4 p_{\min}^6 \frac{\eta^2}{(1 - \eta)^4} \right)$, we have that $$\sum_k \Delta_k^2 \left[ \sum_i b^i x_k^i \frac{1 - p_i}{p_i^2} \right] \geq  \gamma N =: (1 -c) s p_{init}^2 p_{\min} \frac{\eta}{(1 - \eta)^2} N$$
+
+% \begin{remark}
+% Would it be possible to extend this result using Azuma's inequality on Martingales to not just the first measurement of every cascade?
+% \end{remark}
+
+% \subsection*{Second term}
+% We are now going to find an upper-bound on the term $\sum_i b^i x^i_k x^i_j \frac{1 - p_i}{p_i^2}$.
+
+
+\section*{Conclusion}
+
+Suppose we show that Condition~\ref{eq:RSC_condition} is met for $\gamma_n = \gamma N$, then we have the following theorems:
+
+\begin{theorem}
+\label{thm:l2_bound}
+Suppose that $\theta^* \in \mathbb{R}^p$ is s-sparse and that we choose $\lambda_n = 2 \sqrt{\frac{n^{\delta + 1} \log p}{p_{\min}}}$ for $\delta >0$, then with probability $1 - \exp(-n^\delta \log p )$, we have
+\begin{equation}
+\|\hat \theta - \theta^* \|_2 \leq \frac{2}{\gamma} \sqrt{\frac{s \log p}{p_{\min} N^{1 - \delta}}}
+\end{equation}
+\end{theorem}
+
+Note that we can choose $\delta = 0$ in high-dimensions since the probability of success will then be $1 - \frac{1}{p} \approx 1$. We can also conclude on support recovery with the following reasoning. 
+
+\begin{theorem}
+\label{thm:support_recovery}
+Suppose that $N$ is chosen such that $\frac{2}{\gamma}\sqrt{\frac{s \log p}{p_{\min} N^{1 -\delta}}} < \eta$ and suppose we only keep as elements of the support of $\theta^*$ the coordinates $\hat \theta_i > \eta$. Then no wrong parent will be included, and all `strong' parents will be included, where `strong' signifies: $\theta^*_i > 2 \eta$.
+\end{theorem}
+
+It follows that we have found an ${\cal O}(s \log p)$ algorithm for recovering the graph, with better constants and fewer assumptions than any previous work.
+
+\bibliography{sparse}
+\bibliographystyle{plain}
+
+\end{document}
\ No newline at end of file
diff --git a/old_work/reportYaron.tex b/old_work/reportYaron.tex
new file mode 100644
index 0000000..f4878c7
--- /dev/null
+++ b/old_work/reportYaron.tex
@@ -0,0 +1,354 @@
+\documentclass[10pt]{article}
+\usepackage{fullpage, amsmath, amsthm, amssymb}
+\usepackage{graphicx, algorithm, algpseudocode}
+\usepackage[utf8x]{inputenc}
+\newtheorem{theorem}{Theorem}
+
+\title{Sparse Recovery for Graph Reconstruction}
+
+\date{}
+
+\begin{document}
+
+\maketitle
+
+\section{Introduction}
+
+Given a set of observed cascades, the graph reconstruction problem consists of finding the underlying graph on which these cascades spread. We explore different models for cascade creation, namely the voter model and the independent cascade model. Our objective is therefore to approximately compute the `inverse' of the cascade generation function from as few observations as possible.
+
+\section{Related Work}
+
+There have been several works tackling the graph reconstruction problem in variants of the independent cascade. We briefly summarize their results and approaches below.
+
+\begin{itemize}
+\item Manuel Gomez-Rodriguez, Jure Leskovec, and Andreas Klause were amongst the first to tackle the problem of graph reconstruction from cascade observations \cite{gomez}. Their first method \textsc{NetInf}, and a series of other similar methods published subsequently, solves a heuristic which approximates a NP-hard maximum-likelihood formulation. Unfortunately \textsc{NetInf} has no known theoretical guarantees. One difficulty with comparing ourselves to their heuristic is that they assume a continuous time model, which is close but un-adaptable to our framework. Namely, as we will see in Section~\ref{sec:icc}, nodes in our model cannot influence nodes beyond the time step at which they are active.
+\item Their paper was later followed by a publication from Praneeth Netrapalli and Sujay Sanghavi \cite{netrapalli}, which provides two algorithms with interesting theoretical guarantees. 
+  \begin{itemize}
+  \item The first algorithm they discuss is a convex and parallelizable optimization problem. The algorithm is guaranteed to achieve perfect reconstruction of `strong' edges after ${\cal O}(\Delta^2 \log n)$ with high probability, where $\Delta$ is the maximum degree in the graph. There are several things to note about this result:
+  \begin{itemize}
+    \item Researchers believe the true threshold of cascades necessary to achieve perfect reconstruction is closer to ${\cal O}(\Delta \log n)$.
+    \item The big ${\cal O}$ is highly polynomial in the different constants of the model. The constant is of the order of billions even for very generous initializations of these parameters
+    \item The authors make a restrictive assumption, called `correlation decay', on the probabilities of each edge: $\forall i, \sum_{j \in {\cal N}(i)} p_{j,i} < 1 -\alpha$, where $0 < \alpha < 1$. 
+  \end{itemize}
+  \item The second algorithm suggested in \cite{netrapalli}, to which we will compare ourselves in Section~\ref{sec:exp}, is \textsc{Greedy}. It achieves perfect reconstruction after ${\cal O}(d \log n)$ observed cascades with high probability. However, this guarantee has been proven only in the case of tree-graphs. The authors mention that it does fairly well in pratice on other types of graphs.
+  \end{itemize}
+\item Finally, a recent paper by Bruno Abrahao, Flavio Chierichetti, Robert Kleinberg, Alessandro Panconesi gives several algorithms and a different approach to obtaining theoretical guarantees. Their strongest result is for bounded-degree graphs, where they show that they can obtain perfect reconstruction after observing ${\cal O}(\Delta^9 \log^2 \Delta \log n)$ algorithm. This is weaker than the first algorithm suggested by \cite{netrapalli}, but does not make the `correlation decay' assumption. 
+\end{itemize}
+
+What we aim for with the following framework is to achieve theoretical guarantees promising close-to-perfect reconstruction with observing ${\cal O}(\Delta \log n)$ cascades with high probability without relying on unreasonable assumptions like `correlation decay'. We also want our method to be easily implementable in practice and compare favorably to the above algorithms.
+
+\section{The Voter Model}
+
+\subsection{Description}
+In the voter model, there are two types of nodes, \textsc{red} and \textsc{blue}. We fix a horizon $T > 0$. At $t=0$, we decide on a set of \textsc{blue} nodes, called the seed set. Every other node is colored $\textsc{red}$. At each time step, each node $u$ chooses one of its neighbors uniformly, with probability $1/deg(u)$, and adopts the color of that neighbor. A cascade in the voter model can be described as the evolution of which nodes are \textsc{blue} for each time step $0\leq t\leq T$.
+
+Several variants of the model can be formulated, namely whether or not nodes have self-loops and maintain their color with a certain probability or whether the probability that a node is part of a seed set is uniform versus non-uniform, independent versus correlated. For simplicity, we suppose here that nodes do not have self-loops and that the probability that a node is part of the seed set is indepedent and equal to $1/p_{\text{init}}$ for $0 < p_{\text{init}} < 1$.
+
+\subsection{Formulation}
+
+We are going to graph reconstruction problem from voter-model type cascades as a sparse recovery problem. We introduce the following notation:
+
+\paragraph{Notation}
+Denote by $(\vec m^t_k)_{1 \leq k \leq m; 1 \leq t \leq T}$ the set of blue nodes in cascade $k$ at time step $t$. In other words, $\vec m^t_k$ is a vector in $\mathbb{R}^n$ such that:
+
+\[ (\vec m^t_k)_j = \left\{ 
+  \begin{array}{l l}
+    1 & \quad \text{if $j$ is \textsc{blue} in cascade $k$ at time step $t$}\\
+    0 & \quad \text{otherwise}
+  \end{array} \right.\]
+
+Let $\vec x_i$ be the vector representing the neighbors of a node $i$ in the graph ${\cal G}$. In other words, $\vec x_i$ is a vector in $\mathbb{R}^n$ such that:
+
+\[ (\vec x_{i})_j = \left\{ 
+  \begin{array}{l l}
+    1/\text{deg(i)} & \quad \text{if $j$ is a parent of $i$ in ${\cal G}$}\\
+    0 & \quad \text{otherwise}
+  \end{array} \right.\]
+
+Let $w^t_{k,i}$ be the scalar representing the probability that in cascade $k$ at time step $t$ node $i$ stays or becomes \textsc{blue}.
+
+\[ w^t_{k,i} = \left\{ 
+  \begin{array}{l l}
+    \frac{\text{Number of \textsc{blue} neighbors of i in $(k, t-1)$}}{\text{deg}(i)} & \quad \text{if $t\neq 0$}\\
+    1/p_{\text{init}} & \quad \text{otherwise}
+  \end{array} \right.\]
+
+Our objective is to recover the neighbors for each node $i$ in ${\cal G}$. This corresponds to recovering the vectors $(\vec x_i)_{i \in {\cal G}}$. Notice that for each cascade $k$ and every time step $t < T - 1$, we have:
+
+\begin{equation}
+\langle \vec m_k^t, \vec x_i \rangle = \mathbb{P}(\text{node $i$ is \textsc{blue} in cascade k at $t+1$}) = w_{k,i}^{t+1} \nonumber
+\end{equation}
+
+We can concatenate each equality in matrix form $M$, such that the rows of $M$ are the row vectors $(\vec m^t_k)^T$, and for each node $i$, the entries of $\vec w_i$ are the corresponding probabilities:
+
+$$M \vec x_i = \vec w_i$$
+
+Note that if $M$ had full column rank, then we could recover $\vec x_i$ from $\vec b$. This is however an unreasonable assumption, even after having observed many cascades. We must explore which assumptions on $M$ allow us to recover $\vec x_i$ when the number of observed cascades is small. 
+
+Further note that we do not immediately observe $\vec w_i$. Instead, we observe a bernoulli vector, whose j$^{th}$ entry is equal to $1$ with probability $(\vec w_i)_j$ and $0$ otherwise, corresponding to whether at the next time step $i$ became or stayed \textsc{blue}. We will denote this vector $\vec b_i$, and denote by $\vec \epsilon_i$ the zero-mean noise vector such that:
+
+$$\vec w_i = \vec b_i + \vec \epsilon_i$$ 
+
+For each node $i$, we can therefore reformulate the problem with our observables:
+
+\begin{equation}
+\label{eq:sparse_voter}
+M \vec x_i = \vec b_i + \epsilon_i
+\end{equation}
+
+If the vector $\vec x_i$ is sufficiently sparse, i.e. node $i$ has sufficiently few neighbors, then we hope to apply common sparse recovery techniques to solve for $\vec x_i$.
+
+\subsection{Example}
+
+
+\begin{figure}
+\centering
+\includegraphics[width=0.3\textwidth]{images/voter.png}
+\caption{A cascade in the voter model with time steps $t = 0,1,2$  over a graph with 5 vertices}
+\end{figure}
+
+
+
+
+To recover the neighbors of $v_1$, we get the following matrix $M$ for the example in Figure 1:
+
+\begin{align*}
+&\hspace{0.35cm} v_2 \hspace{0.2 cm}  v_3 \hspace{0.2 cm} v_4 \hspace{0.2 cm} v_5 &\\
+\vspace{1 cm}
+M  = &  \left( \begin{array}{cccc}
+0 & 0 & 1 & 1 \\
+1 & 1 & 0 & 0 \\
+\end{array} \right)  &  \begin{array}{l} \hspace{ -4.5 cm} 
+\text{time step 0} \\
+ \hspace{ - 4.5cm}  \text{time step 1} \\
+\end{array}
+\end{align*}
+
+and the vector $b_1$:
+
+\begin{align*}
+b_1  = &  \left( \begin{array}{c}
+1 \\
+1 \\
+\end{array} \right) &  \begin{array}{l} \hspace{ - 5cm} 
+\text{time step 1} \\
+ \hspace{ - 5cm}  \text{time step 2} \\
+ \end{array}
+\end{align*}
+
+\section{The Independent Cascade Model}
+\label{sec:icc}
+\subsection{Description}
+
+In the independent cascade model, nodes have three possible states: susceptible, infected or active\footnote{The words `infected' and `active' are used interchangeably}, and inactive. All nodes start either as susceptible or infected. The infected nodes at $t=0$ constitute the seed set. Similarly to the voter model, we will suppose that nodes are part of the seed set with independent probability $p_{\text{init}}$. At each time step $t$, for every pair of nodes $(j,i)$ where $j$ is infected and $i$ is susceptible, $j$ attempts to infect $i$ with probability $p_{j,i}$. If $j$ succeeds, $i$ will become active at time step $t+1$. Regardless of $j$'s success, node $j$ will be inactive at time $t+1$: nodes stay active for only one time step. The cascade continues until time step $T_k$ when no active nodes remain.
+
+\subsection{Formulation}
+
+We are going to graph reconstruction problem from independent-cascade-model-type cascades as a sparse recovery problem. We introduce the following notation:
+
+\paragraph{Notation}
+We adapt slightly the definition of $(\vec m^t_k)$:
+
+\[ (\vec m^t_k)_j = \left\{ 
+  \begin{array}{l l}
+    1 & \quad \text{if $j$ is \textsc{active} in cascade $k$ at time step $t$}\\
+    0 & \quad \text{otherwise}
+  \end{array} \right.\]
+
+Similarly to the voter model, if $\vec p_i$ is the vector representing the `parents' of node $i$ in the graph ${\cal G}$ such that $(\vec p_i)_j$ is the probability that node $j$ infects $i$, then $\vec \theta_i$ is the vector:
+
+\[ (\vec \theta_{i})_j = \left\{ 
+  \begin{array}{l l}
+    \log ( 1- p_{j,i}) & \quad \text{if $j$ is a parent of $i$ in ${\cal G}$}\\
+    0 & \quad \text{otherwise}
+  \end{array} \right.\]
+
+Note that if $p_{j,i} = 0$, then $\theta_{j,i} = 0$ and we can write $\vec \theta = \log (1 - \vec p)$ element-wise. For $t>0$, let $w^t_{k,i}$ be the scalar representing the probability that in cascade $k$ at time step $t$ node $i$ becomes \textsc{active}, conditioned on the state of cascade  $k$ at time step $t-1$. As long as node $i$ has not yet been infected in cascade $k$, we can write as in the voter model, we can observe that: 
+
+$$\langle \vec m_k^t, \vec \theta_i \rangle = \vec w^t_{k,i}$$
+
+As in the voter model, by concatenating all row vectors $\vec m^t_{k,i}$ such that $t < t^k_i$, where $t^k_i$ is the infection of node $i$ in cascade $k$, in the matrix $M_i$, we can write:
+
+$$M_i \vec \theta_i = \vec w_i$$
+
+Once again, we do not have access to $\vec w_i$. Instead, for $t<T_k$, we have access to the bernoulli variable $b_i$ of parameter $1 - e^{w_i}$, indicating whether or not node $i$ became active at time step $t+1$ in cascade $k$. Recovering $\vec \theta_i$ for every node $i$ can be written as the following problem:
+
+\begin{equation}
+e^{M_i \vec \theta_i} = 1 - \vec b_i + \vec \epsilon_i
+\end{equation}
+
+Note that this is not exactly a sparse recovery model due to the non-linear exponential term. It is of the author's opinion however that if the probabilities $p_{j,i}$ are restrained to a bounded interval $[0, 1- \eta]$, then most of the results which hold for the linear voter model will continue to hold in this case.
+
+\subsection{Example}
+
+\begin{figure}
+\centering
+\includegraphics[width=0.3\textwidth]{images/icc.png}
+\caption{A cascade in the independent cascade model with time steps $t = 0,1,2$  over a graph with 5 vertices}
+\end{figure}
+
+
+
+To recover the neighbors of $v_5$, we get the following matrix $M$ for the example in Figure 2:
+\begin{align*}
+\centering
+&\hspace{0.35cm} v_1 \hspace{0.2 cm}  v_2 \hspace{0.2 cm} v_3 \hspace{0.2 cm} v_4 \\
+M  = &  \left( \begin{array}{cccc}
+1 & 0 & 0 & 0 \\
+0 & 1 & 1 & 0 \\
+\end{array} \right)   \begin{array}{l} 
+\text{time step 0} \\
+  \text{time step 1} \\
+ \end{array}
+\end{align*}
+
+and the vector $b_5$:
+
+\begin{align*}
+b_5  = &  \left( \begin{array}{c}
+0 \\
+1 \\
+\end{array} \right)  \begin{array}{l}
+\text{time step 1} \\
+ \text{time step 2} \\
+ \end{array}
+\end{align*}
+
+
+
+\section{Sparse Recovery}
+
+\subsection{Introduction}
+The literature on sparse recovery has expanded rapidly since a series of papers were published by Candès, Tao and Romberg in 2006. In 8 years, over 7000 articles have been written on the topic since these few seminal publications. Many of them show that if the matrix $M$ is `well-conditioned', then any $s$-sparse signal can be recovered from ${\cal O}(s \log n)$ measurements (a row in the matrix $M$). We explored several of these papers to understand which conditions on $M$ were most reasonable.
+
+\subsection{Exploration}
+\paragraph{Disclosure}
+After an unsatisfactory attempt at adapting \cite{candestao}, we cite the main results of \cite{candes} and give arguments as to why we believe, with more work, these assumptions can be made reasonable in the case our model. Due to the limited time frame of this project, we were unable to find a complete and rigorous argumentation as to why these assumptions hold. The following paragraphs should be understood as motivation for this line of research rather than concrete guarantees.
+
+\paragraph{Framework}
+Suppose that each row $m$ of $M$ is taken from a distribution ${\cal F}$, such that $\mathbb{E}(m m^T) = I_n$. Suppose further that $\|m\|_{\infty} \leq \mu$ and that $\|M^T \epsilon\|_{\infty} < 2.5 \sqrt{\log n}$. For every node $i$, consider the following optimization program:
+
+\begin{equation}
+\label{eq:optimization_program}
+\min_{x_i \in \mathbb{R}^n} \frac{1}{2}\|M \vec x_i - b_i\|^2_2 + \lambda \| x_i\|_1
+\end{equation}
+
+\begin{theorem}
+\label{thm:candes}
+If the number of rows $l$ of our matrix $M$ verifies $l \geq C \mu s \log n$ (where $C$ and $s$ are constants), then with high probability and for the right choice of $\lambda$, the solution $\hat x$ to Eq.~\ref{eq:optimization_program} obeys:
+\begin{equation}
+\label{eq:first_guarantee}
+\| \hat x - x^* \|_2 \leq C (1 + \log^{3/2}n)\left[\frac{\|x^* - \tilde x\|_1}{\sqrt{s}} + \sigma \sqrt{\frac{s\log n}{l}}\right]
+\end{equation}
+where $\sigma$ is a constant dependent on the variance of $\epsilon$, and $\tilde x$ is the best s-sparse approximation to the ground truth $x^*$. In other words if node $i$ is of degree at most $s$, then $\|x^* - \tilde x\|_1 = 0$ since $x^*$ is $s-sparse$.\footnote{The result in the poster implicitly supposes that $x_i^*$ is $s$-sparse, i.e. that node $i$ has at most $s$ neighbors. The term $\|x^* - \tilde x\|_1=0$ and does not appear in the statement of the theorem.}
+\end{theorem}
+
+\paragraph{Discussion}
+\begin{itemize}
+\item Note how these results only hold for the voter model. For the independent cascade, our first unsucessful attempt at applying \cite{candestao} showed that its results could be extended to the non-linear case with different constants under the assumption that $\forall i,j, \ p_{j,i} < 1 - \eta$. Since the spirit of the proofs are very similar between the two papers, we hope that Theorem~\ref{thm:candes} can also be extended.
+\item The assumption $\|M^T \epsilon\|_{\infty} < 2.5 \sqrt{\log n}$ can most likely be shown to hold in our framework. Indeed, regardless of the normalization of $M$, $\epsilon$ will be of mean 0. Therefore, by a Chernoff-like bound followed by a union bound, $\|M^T \epsilon\|_{\infty}$ will be small with high probability.
+\item The more difficult assumption to adapt is $\mathbb{E}(m m^T) = I_n$. The authors mention that their results can be extended to cases where $\mathbb{E}(m m^T) \approx I_n$. Suppose that restrict ourselves to the first row of every cascade. In that case, the entries of our matrix are independent bernoulli variables equal to $1$ with probability $p_{\text{init}}$ and 0 otherwise. If we normalize our matrix with $1/\sqrt{p_\text{init}}$, then $\mathbb{E}(m' m'^T)$ is an $n \times n$ matrix with 1 on the diagonal on $p_{\text{init}}$ everywhere else, which is $\approx I_n$ if $p_{\text{init}}$ is small.
+\item The authors also mention that if the restricted isometry property (RIP) holds with $\delta < \frac{1}{4}$, then a similar theorem could easily be shown. The restriced isometry property characterizes a quasi-orthonormality of the measurement matrix M on sparse vectors: For all k, we define $\delta_k$ as the best possible constant such that for all unit-normed ($l$2) and k-sparse vectors $x$:
+
+\begin{equation}
+1-\delta_k \leq \|Mx\|^2_2 \leq 1 + \delta_k \nonumber
+\end{equation}
+
+In general, the smaller $\delta_k$ is, the better we can recover $k$-sparse vectors $x$. In the next section, we explore the $\delta$ constant of such a matrix, and show that in certain contexts it can be shown to be $\delta < \frac{1}{4}$.
+\end{itemize}
+
+\section{Experiments}
+\label{sec:exp}
+
+\subsection{Verifying the RIP constants}
+
+\paragraph{Motivation} Though it is not yet clear which normalization to choose for the columns of matrix $M$, nor whether we should include only the first step of every cascade or the cascade in its entirety, we ran simple experiments to show that the measurement matrix $M$ has `well-behaved' RIP constants. Indeed, the authors of \cite{candes} mention in their proofs that with an RIP strictly less than $\frac{1}{4}$, the assumptions of the theorem would hold, with slightly different constants. Due to a lack of time, we were unable to verify these claims or understand what the final statement of the theorem would be. In general, a measurement matrix with small RIP constants is known to recover sparse vectors well. These experiments should be considered as another reason to think that the sparse recovery framework is appropriate to the graph reconstruction problem.  
+
+\paragraph{In practice} Finding a non-trivial RIP constant is strongly NP-hard, and as such we were unable to test on networks with more than 25 nodes. To compute the $k-th$ RIP constant ($\delta_k$ with our notation), we must extract $k$ columns of $M$, and compute the following programs on the resulting matrices $(M_k)$ (there are $\binom{n}{k}$ of them!):
+
+\begin{table}
+\centering
+\begin{tabular}{c | c | c | c }
+c = 100 & c = 1000 & c = 100 & c = 1000 \\
+$p_\text{init}$ = .1 & $p_\text{init}$ = .1 & $p_\text{init}$ = .05 & $p_\text{init}$ = .05 \\
+\hline
+$\delta_4 = .54$ & $\delta_4 = .37$ & $\delta_4 = .43$ & $\delta_4 = .23$ \\
+\end{tabular}
+\caption{Results on dataset of 22 nodes extracted from the Karate club network. $p_\text{init}$ is the probability that a node is part of the seed set. $c$ is the number of cascades. $\delta_4$ is the RIP constant for $4$-sparse vectors. Cascades are generated using the independent cascade model.}
+\end{table}
+
+
+\begin{align}
+\label{eq:lower_bound}
+\min \qquad & \| M_k \vec x \|_2 \\
+\| \vec x\|_2 = 1 & \nonumber
+\end{align}
+\begin{align}
+\label{eq:upper_bound}
+\max \qquad & \| M_k \vec x \|_2 \\
+\| \vec x\|_2 = 1 & \nonumber
+\end{align}
+
+\paragraph{Discussion} Note how choosing the minimum of Eq~\ref{eq:lower_bound} over all extracted matrices returns the lowest possible $1 - \delta^-_k$ and choosing the maximum of Eq~\ref{eq:upper_bound} over all extracted matrices returns the highest possible $1 + \delta^+_k$. The smallest admissible $k-$RIP constant is therefore: $\max(\delta^+_k, \delta^-_k)$. Equations \ref{eq:lower_bound} and \ref{eq:upper_bound} can be solved efficiently since they correspond to the smallest and greatest eigenvalues of $M^TM$ respectively. 
+
+The results of our findings on a very small social network (a subset of the famous Karate club), show that as the number of cascades increase the RIP constants decrease and that if $p_\text{init}$ is small then the RIP constant decrease as well. Finally the constants we obtain are either under or close to the $.25$ mark set by the authors of \cite{candes}. By choosing appropriate parameters (number of cascades large enough, and $p_\text{init}$ small enough), we can place ourselves in the framework of this theorem for nodes of degree $4$ in a dataset of size 22.
+
+
+\subsection{Testing our algorithm}
+
+\begin{figure}
+\centering
+\label{fig:comparison-with-greedy}
+\includegraphics[scale=.35]{images/greedy_sparse_comparison.png}
+\caption{Plot of the F1 score for different number of cascades for both the \textsc{greedy} algorithm and Algorithm~\ref{eq:optimization_program}. The cascades, graphs, and edge probabilities were identical for both algorithms. The dataset is a subset of 333 nodes and 5039 edges taken from the Facebook graph, made available by \cite{snap}. We chose $p_\text{init}=.05$, and the edge probability were chosen uniformly between $0$ and $0.8$. For Algorithm~\ref{eq:optimization_program}, we chose $\lambda=10$ and kept all edges with probability greater than $.1$ as true edges.}
+\end{figure}
+
+We were only able to test our algorithm in the case of the independent cascade model and compare it to the \textsc{greedy} baseline suggested in \cite{netrapalli}. We picked a small subset of 333 nodes and 5039 edges from the Facebook graph, made available by \cite{snap}. As we can see from Figure~\ref{fig:comparison-with-greedy}, our algorithm outperforms considerably the \textsc{greedy} baseline. Of things to take into consideration are the fact that the graph is very small, that the \textsc{greedy} algorithm has no guarantee for non-tree networks\footnote{Their authors do mention that it performs well in practice on non-tree networks as well}, and that \textsc{greedy} is a very simple heuristic. To get an accurate picture of our algorithm's contribution, it would be necessary to compare it to every other algorithm suggested in \cite{netrapalli} and \cite{abrahao}.
+
+
+\section{Conclusion}
+
+In conclusion, we described how a well-studied framework could be applied to the graph reconstruction problem. We give arguments as to why applying this framework could lead to better guarantees than pre-existing results. We also test our model in practice and show that it performs better than a simple heuristic, validating our approach to a certain extent.
+
+\begin{thebibliography}{42}
+
+\bibitem{gomez}
+Gomez-Rodriguez, M., Leskovec, J., and Krause, A.
+\newblock {\it Inferring Networks of Diffusion and Influence}
+\newblock In Proc. 16th ACM SIGKDD international conference on Knowledge discovery and data mining, KDD'10, pages 1019--1028,
+\newblock 2010
+
+\bibitem{netrapalli}
+Netrapalli, P., and Sanghavi, S.,
+\newblock {\it Learning the Graph of Epidemic Cascades}
+\newblock SIGMETRICS Perform. Eval. Rev., 40(1): 211--222,
+\newblock 2012
+
+
+\bibitem{abrahao}
+Abrahao, B., Chierichetti, F., Kleinberg, R., and Panconesi, A.
+\newblock{\it Trace Complexity of Network Inference}
+\newblock In Proc. 19th ACM SIGKDD international conference on Knowledge discovery and data mingin, KDD'13, pages 491--499,
+\newblock 2013
+
+\bibitem{candestao}
+Candès, E. J., Romberg, J. K., and Tao, T.
+\newblock {\it Stable Signal Recovery from Incomplete and Inaccurate Measurements},
+\newblock Comm. Pure Appl. Math., 59: 1207--1223,
+\newblock 2006,
+
+\bibitem{candes}
+Candès, E., and Plan, Y.
+\newblock {\it A Probabilistic and RIPless Theory of Compressed Sensing}
+\newblock Information Theory, IEEE Transactions on, 57(11): 7235--7254,
+\newblock 2011.
+
+\bibitem{snap}
+Leskovec, J., and Krevl, A.,
+\newblock {\it SNAP Datasets: Stanford Large Network Dataset Collection},
+\newblock 2014.
+
+
+\end{thebibliography}
+
+\end{document}
\ No newline at end of file
diff --git a/old_work/sparse.bib b/old_work/sparse.bib
new file mode 100644
index 0000000..969e45d
--- /dev/null
+++ b/old_work/sparse.bib
@@ -0,0 +1,105 @@
+@article {CandesRomberTao:2006,
+author = {Candès, Emmanuel J. and Romberg, Justin K. and Tao, Terence},
+title = {Stable signal recovery from incomplete and inaccurate measurements},
+journal = {Communications on Pure and Applied Mathematics},
+volume = {59},
+number = {8},
+publisher = {Wiley Subscription Services, Inc., A Wiley Company},
+issn = {1097-0312},
+pages = {1207--1223},
+year = {2006},
+}
+
+
+@inproceedings{GomezRodriguez:2010,
+ author = {Gomez Rodriguez, Manuel and Leskovec, Jure and Krause, Andreas},
+ title = {Inferring Networks of Diffusion and Influence},
+ booktitle = {Proceedings of the 16th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
+ series = {KDD '10},
+ year = {2010},
+ isbn = {978-1-4503-0055-1},
+ location = {Washington, DC, USA},
+ pages = {1019--1028},
+ numpages = {10},
+ publisher = {ACM},
+ address = {New York, NY, USA},
+} 
+
+
+@article{Netrapalli:2012,
+ author = {Netrapalli, Praneeth and Sanghavi, Sujay},
+ title = {Learning the Graph of Epidemic Cascades},
+ journal = {SIGMETRICS Perform. Eval. Rev.},
+ volume = {40},
+ number = {1},
+ month = {June},
+ year = {2012},
+ issn = {0163-5999},
+ pages = {211--222},
+ numpages = {12},
+ publisher = {ACM},
+ address = {New York, NY, USA},
+ keywords = {cascades, epidemics, graph structure learning},
+}
+
+@article{Negahban:2009,
+ author = {Negahban, Sahand N. and Ravikumar, Pradeep and Wrainwright, Martin J. and Yu, Bin},
+ title = {A Unified Framework for High-Dimensional Analysis of M-Estimators with Decomposable Regularizers},
+ Journal = {Statistical Science},
+ year = {2012},
+ month = {December},
+ volume = {27},
+ number = {4},
+ pages = {538--557},
+}
+
+@article{Zhao:2006,
+ author = {Zhao, Peng and Yu, Bin},
+ title = {On Model Selection Consistency of Lasso},
+ journal = {J. Mach. Learn. Res.},
+ issue_date = {12/1/2006},
+ volume = {7},
+ month = dec,
+ year = {2006},
+ issn = {1532-4435},
+ pages = {2541--2563},
+ numpages = {23},
+ url = {http://dl.acm.org/citation.cfm?id=1248547.1248637},
+ acmid = {1248637},
+ publisher = {JMLR.org},
+} 
+
+@inproceedings{Daneshmand:2014,
+  author    = {Hadi Daneshmand and
+               Manuel Gomez{-}Rodriguez and
+               Le Song and
+               Bernhard Sch{\"{o}}lkopf},
+  title     = {Estimating Diffusion Network Structures: Recovery Conditions, Sample
+               Complexity {\&} Soft-thresholding Algorithm},
+  booktitle = {Proceedings of the 31th International Conference on Machine Learning,
+               {ICML} 2014, Beijing, China, 21-26 June 2014},
+  pages     = {793--801},
+  year      = {2014},
+  crossref  = {DBLP:conf/icml/2014},
+  url       = {http://jmlr.org/proceedings/papers/v32/daneshmand14.html},
+  timestamp = {Fri, 07 Nov 2014 20:42:30 +0100},
+  biburl    = {http://dblp.uni-trier.de/rec/bib/conf/icml/DaneshmandGSS14},
+  bibsource = {dblp computer science bibliography, http://dblp.org}
+}
+
+\bibitem{abrahao}
+Abrahao, B., Chierichetti, F., Kleinberg, R., and Panconesi, A.
+\newblock{\it Trace Complexity of Network Inference}
+\newblock In Proc. 19th ACM SIGKDD international conference on Knowledge discovery and data mingin, KDD'13, pages 491--499,
+\newblock 2013
+
+\bibitem{candes}
+Candès, E., and Plan, Y.
+\newblock {\it A Probabilistic and RIPless Theory of Compressed Sensing}
+\newblock Information Theory, IEEE Transactions on, 57(11): 7235--7254,
+\newblock 2011.
+
+\bibitem{snap}
+Leskovec, J., and Krevl, A.,
+\newblock {\it SNAP Datasets: Stanford Large Network Dataset Collection},
+\newblock 2014.
\ No newline at end of file
-- 
cgit v1.3.1