\documentclass[10pt]{article}
\usepackage{fullpage, amsmath, amsthm, amssymb}
\usepackage{graphicx, algorithm, algpseudocode}
\usepackage[utf8x]{inputenc}
\newtheorem{theorem}{Theorem}
\title{Sparse Recovery for Graph Reconstruction}
\author{Eric Balkanski \and Jean Pouget-Abadie}
\date{}
\begin{document}
\maketitle

\section{Introduction}
Given a set of observed cascades, the \textbf{graph reconstruction problem} consists of finding the underlying graph on which these cascades spread. We explore two models of cascade creation, namely the \textbf{voter model} and the \textbf{independent cascade model}.

\section{Related Work}
In previous work, this problem has been formulated in several ways, including as a convex optimization problem and as a maximum likelihood problem. However, no known algorithm for graph reconstruction offers both theoretical guarantees and a reasonable required sample size.

\section{The Voter Model}
\subsection{Description}
In the voter model, there are two types of nodes, \textsc{red} and \textsc{blue}. We fix a horizon $T > 0$. At $t=0$, we choose a set of \textsc{blue} nodes, called the seed set. Every other node is colored \textsc{red}. At each time step, each node $u$ chooses one of its neighbors uniformly at random, i.e.\ with probability $1/\deg(u)$, and adopts the color of that neighbor. A cascade in the voter model can be described as the evolution of the set of \textsc{blue} nodes over the time steps $0 \leq t \leq T$.
Several variants of the model can be formulated: nodes may or may not have self-loops (and hence maintain their color with some probability), and the probability that a node belongs to the seed set may be uniform or non-uniform, independent or correlated across nodes. For simplicity, we suppose here that nodes do not have self-loops and that each node belongs to the seed set independently with probability $p_{\text{init}}$, for some $0 < p_{\text{init}} < 1$.

\subsection{Formulation}
We formulate the graph reconstruction problem from voter-model cascades as a sparse recovery problem. We introduce the following notation.

\paragraph{Notation} Denote by $(\vec m^t_k)_{1 \leq k \leq m,\ 0 \leq t \leq T}$ the indicator vectors of the \textsc{blue} nodes in cascade $k$ at time step $t$. In other words, $\vec m^t_k$ is a vector in $\mathbb{R}^n$ such that:
\[ (\vec m^t_k)_j = \left\{ \begin{array}{l l} 1 & \quad \text{if $j$ is \textsc{blue} in cascade $k$ at time step $t$}\\ 0 & \quad \text{otherwise} \end{array} \right.\]
Let $\vec x_i$ be the vector representing the neighbors (parents) of node $i$ in the graph ${\cal G}$. In other words, $\vec x_i$ is a vector in $\mathbb{R}^n$ such that:
\[ (\vec x_{i})_j = \left\{ \begin{array}{l l} 1/\deg(i) & \quad \text{if $j$ is a parent of $i$ in ${\cal G}$}\\ 0 & \quad \text{otherwise} \end{array} \right.\]
Let $w^t_{k,i}$ be the scalar representing the probability that node $i$ stays or becomes \textsc{blue} in cascade $k$ at time step $t$, conditioned on the state of the cascade at time step $t-1$:
\[ w^t_{k,i} = \left\{ \begin{array}{l l} \frac{\text{number of \textsc{blue} neighbors of $i$ in cascade $k$ at time step $t-1$}}{\deg(i)} & \quad \text{if $t \neq 0$}\\ p_{\text{init}} & \quad \text{if $t = 0$} \end{array} \right.\]
Our objective is to recover the neighbors of each node $i$ in ${\cal G}$, which corresponds to recovering the vectors $(\vec x_i)_{i \in {\cal G}}$.
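For concreteness, consider a hypothetical node $i$ with exactly three neighbors $a$, $b$, and $c$, two of which, say $a$ and $b$, are \textsc{blue} in cascade $k$ at time step $t-1$. Then
\[ (\vec x_i)_a = (\vec x_i)_b = (\vec x_i)_c = \frac{1}{3}, \qquad (\vec x_i)_j = 0 \ \text{for every other $j$}, \qquad w^t_{k,i} = \frac{2}{3}. \]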
Notice that for each cascade $k$ and every time step $0 \leq t \leq T - 1$, we have:
\begin{equation} \langle \vec m_k^t, \vec x_i \rangle = \mathbb{P}\left(\text{node $i$ is \textsc{blue} in cascade $k$ at time step $t+1$} \mid \vec m^t_k\right) = w_{k,i}^{t+1} \nonumber \end{equation}
We can concatenate these equalities in matrix form: let $M$ be the matrix whose rows are the row vectors $(\vec m^t_k)^T$ and, for each node $i$, let $\vec w_i$ be the vector whose entries are the corresponding probabilities, so that
$$M \vec x_i = \vec w_i$$
Note that if $M$ had full column rank, then we could recover $\vec x_i$ from $\vec w_i$. This is, however, an unreasonable assumption, even after observing many cascades. We must therefore explore which assumptions on $M$ allow us to recover $\vec x_i$ when the number of observed cascades is small. Further note that we do not directly observe $\vec w_i$. Instead, we observe a Bernoulli vector, whose $j^{\text{th}}$ entry is equal to $1$ with probability $(\vec w_i)_j$ and $0$ otherwise, corresponding to whether node $i$ became or stayed \textsc{blue} at the next time step. We denote this vector by $\vec b_i$, and we denote by $\vec \epsilon_i$ the zero-mean noise vector such that:
$$\vec w_i = \vec b_i + \vec \epsilon_i$$
For each node $i$, we can therefore reformulate the problem in terms of our observables:
\begin{equation} \label{eq:sparse_voter} M \vec x_i = \vec b_i + \vec \epsilon_i \end{equation}
If the vector $\vec x_i$ is sufficiently sparse, i.e.\ node $i$ has sufficiently few neighbors, then we hope to apply common sparse recovery techniques to solve for $\vec x_i$, as illustrated at the end of this section.

\subsection{Example}
\begin{figure}
\centering
\includegraphics[width=0.3\textwidth]{images/voter.png}
\caption{A cascade in the voter model with time steps $t = 0,1,2$ over a graph with 5 vertices}
\label{fig:voter}
\end{figure}
To recover the neighbors of $v_1$ in the example of Figure~\ref{fig:voter}, we obtain the following matrix $M$:
\begin{align*}
& \hspace{0.35cm} v_2 \hspace{0.2cm} v_3 \hspace{0.2cm} v_4 \hspace{0.2cm} v_5 \\
M = & \left( \begin{array}{cccc}
0 & 0 & 1 & 1 \\
1 & 1 & 0 & 0 \\
\end{array} \right) \quad
\begin{array}{l}
\text{time step 0} \\
\text{time step 1} \\
\end{array}
\end{align*}
and the vector $\vec b_1$:
\begin{align*}
\vec b_1 = & \left( \begin{array}{c}
1 \\
1 \\
\end{array} \right) \quad
\begin{array}{l}
\text{time step 1} \\
\text{time step 2} \\
\end{array}
\end{align*}
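One common sparse recovery technique for problems of the form of equation~\eqref{eq:sparse_voter} is $\ell_1$-regularized least squares; for instance, since the entries of $\vec x_i$ are non-negative, one may solve
\[ \hat{\vec x}_i \in \arg\min_{\vec x \geq 0} \ \frac{1}{2} \| M \vec x - \vec b_i \|_2^2 + \lambda \| \vec x \|_1, \]
where $\lambda > 0$ controls the sparsity of the solution. The following is a minimal sketch of this approach, assuming NumPy and scikit-learn are available; the function name \texttt{recover\_neighbors}, the regularization strength \texttt{alpha}, and the support threshold are illustrative choices rather than prescriptions.
\begin{verbatim}
# Minimal sketch: recover the support of x_i from M x_i ~ b_i
# via non-negative l1-regularized least squares (Lasso).
import numpy as np
from sklearn.linear_model import Lasso

def recover_neighbors(M, b, alpha=0.01, threshold=1e-3):
    # M: one row per observed (cascade, time step) pair, one column per node
    # b: 0/1 outcomes for node i at the following time step
    # positive=True encodes that the entries of x_i are non-negative;
    # fit_intercept=False because the model M x_i = w_i has no offset.
    lasso = Lasso(alpha=alpha, positive=True,
                  fit_intercept=False, max_iter=10000)
    lasso.fit(M, b)
    x_hat = lasso.coef_
    # Declare j a neighbor of i if its estimated weight is non-negligible.
    neighbors = np.flatnonzero(x_hat > threshold)
    return neighbors, x_hat
\end{verbatim}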
\section{The Independent Cascade Model}
\subsection{Description}
In the independent cascade model, nodes have three possible states: susceptible, infected or active\footnote{The words `infected' and `active' are used interchangeably.}, and inactive. All nodes start either as susceptible or infected. The infected nodes at $t=0$ constitute the seed set. Similarly to the voter model, we suppose that each node is part of the seed set independently with probability $p_{\text{init}}$. At each time step $t$, for every pair of nodes $(j,i)$ where $j$ is infected and $i$ is susceptible, $j$ attempts to infect $i$ with probability $p_{j,i}$. If $j$ succeeds, $i$ becomes active at time step $t+1$. Regardless of $j$'s success, node $j$ becomes inactive at time step $t+1$: nodes stay active for only one time step. The cascade continues until a time step $T_k$ at which no active nodes remain.

\subsection{Formulation}
We formulate the graph reconstruction problem from independent-cascade-model cascades as a sparse recovery problem. We introduce the following notation.

\paragraph{Notation} We slightly adapt the definition of $\vec m^t_k$:
\[ (\vec m^t_k)_j = \left\{ \begin{array}{l l} 1 & \quad \text{if $j$ is \textsc{active} in cascade $k$ at time step $t$}\\ 0 & \quad \text{otherwise} \end{array} \right.\]
Similarly to the voter model, if $\vec p_i$ is the vector representing the `parents' of node $i$ in the graph ${\cal G}$, such that $(\vec p_i)_j$ is the probability that node $j$ infects $i$, then $\vec \theta_i$ is the vector:
\[ (\vec \theta_{i})_j = \left\{ \begin{array}{l l} \log ( 1- p_{j,i}) & \quad \text{if $j$ is a parent of $i$ in ${\cal G}$}\\ 0 & \quad \text{otherwise} \end{array} \right.\]
Note that if $p_{j,i} = 0$, then $(\vec \theta_i)_j = 0$, so we can write $\vec \theta_i = \log (1 - \vec p_i)$ element-wise.
For $t>0$, let $w^t_{k,i}$ be the logarithm of the probability that node $i$ does \emph{not} become \textsc{active} in cascade $k$ at time step $t$, conditioned on the state of cascade $k$ at time step $t-1$. As long as node $i$ has not yet been infected in cascade $k$, we can observe, as in the voter model, that:
$$\langle \vec m_k^t, \vec \theta_i \rangle = w^{t+1}_{k,i}$$
As in the voter model, by concatenating in a matrix $M_i$ all row vectors $\vec m^t_k$ such that $t < t^k_i$, where $t^k_i$ is the infection time of node $i$ in cascade $k$, we can write:
$$M_i \vec \theta_i = \vec w_i$$
where the entries of $\vec w_i$ are the corresponding values $w^{t+1}_{k,i}$. Once again, we do not have access to $\vec w_i$. Instead, for $t < t^k_i$, we only observe whether node $i$ became \textsc{active} at time step $t+1$, an event which, conditioned on the state of the cascade at time step $t$, occurs with probability $1 - e^{w^{t+1}_{k,i}}$.
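Note that once an estimate $\hat{\vec \theta}_i$ of $\vec \theta_i$ has been obtained by some sparse recovery procedure, the infection probabilities themselves can be read off by inverting the definition of $\vec \theta_i$ element-wise:
\[ \hat p_{j,i} = 1 - e^{(\hat{\vec \theta}_i)_j}. \]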