\documentclass{sig-alternate-2013}
\pdfpagewidth=8.5in
\pdfpageheight=11in
\usepackage[utf8x]{inputenc}
\usepackage{amsmath, amsfonts, amssymb, bbm}
\usepackage{verbatim}

\newcommand{\reals}{\mathbb{R}}
\newcommand{\ints}{\mathbb{N}}
\renewcommand{\O}{\mathcal{O}}
\DeclareMathOperator{\E}{\mathbb{E}}
\let\P\relax
\DeclareMathOperator{\P}{\mathbb{P}}
\newcommand{\ex}[1]{\E\left[#1\right]}
\newcommand{\prob}[1]{\P\left[#1\right]}
\newcommand{\inprod}[2]{#1 \cdot #2}
\newcommand{\defeq}{\equiv}
\DeclareMathOperator*{\argmax}{argmax}
\DeclareMathOperator*{\argmin}{argmin}

\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{corollary}{Corollary}
\newtheorem{remark}{Remark}
\newtheorem{proposition}{Proposition}
\newtheorem{definition}{Definition}

\permission{Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage, and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s). Copyright is held by the author/owner(s).}
\conferenceinfo{WWW 2015 Companion,}{May 18--22, 2015, Florence, Italy.}
\copyrightetc{ACM \the\acmcopyr}
\crdata{978-1-4503-3473-0/15/05. \\ http://dx.doi.org/10.1145/2740908.2744107}

\clubpenalty=10000
\widowpenalty=10000

\title{Inferring Graphs from Cascades:\\ A Sparse Recovery Framework}

\numberofauthors{2}
\author{
\alignauthor Jean Pouget-Abadie\\
\affaddr{Harvard University}\\
\email{jeanpougetabadie@g.harvard.edu}
\alignauthor Thibaut Horel\\
\affaddr{Harvard University}\\
\email{thorel@seas.harvard.edu}
}

\begin{document}
\maketitle

\begin{abstract}
In the Graph Inference problem, one seeks to recover the edges of an unknown graph from observations of cascades propagating over this graph. We approach this problem from the sparse recovery perspective. We introduce a general model of cascades, including the voter model and the independent cascade model, for which we provide the first algorithm that recovers the graph's edges with high probability from $\O(s\log m)$ measurements, where $s$ is the maximum degree of the graph and $m$ is the number of nodes. Furthermore, we show that our algorithm also recovers the edge weights (the parameters of the diffusion process) and is robust in the context of approximate sparsity. Finally, we validate our approach empirically on synthetic graphs.
\end{abstract}

\category{I.2.6}{Artificial Intelligence}{Learning}[Parameter Learning]

\section{Introduction}
A recent line of research has focused on the Graph Inference problem: recovering the directed edges of an unknown graph from observations of a diffusion process propagating on this graph \cite{Abrahao:13, Daneshmand:2014, Netrapalli:2012}. For example, the Independent Cascade model, formalized in \cite{Kempe:03}, is a well-known diffusion process in which each ``infected'' node independently attempts to ``infect'' each of its neighbors with an edge-specific probability.
If we only observe the time steps at which nodes become infected over several diffusion processes, can we recover the edges and the edge weights of the graph? Here, we propose a sparse recovery framework which not only solves the Graph Inference problem, but also recovers the unknown weights of the diffusion process, for a large class of discrete-time diffusion processes. Recall that for every instance of the diffusion process, the only information available to the observer is the time step at which each vertex of the graph becomes ``infected'' by the diffusion process. The parallel with sparse recovery problems is as follows: for a given vertex, the (unknown) ``influence'' of its parents in the graph is a \emph{signal}, which we observe through a series of \emph{measurements}, namely the instances of the diffusion process. The two main challenges in applying sparse recovery tools to this problem are: (1) contrary to a common assumption in sparse recovery, the measurements given by a diffusion process are correlated; and (2) most diffusion processes lead to non-linear sparse recovery problems.

In what follows, we first present a general class of discrete-time diffusion processes which encompasses the well-known Independent Cascade model and the voter model. For this class of diffusion processes, despite the aforementioned challenges, we show how to recover the unknown parameters with a convergence rate on par with the rates observed in the sparse recovery literature. Finally, we validate our approach experimentally by comparing its performance to that of prior algorithms on synthetic data.

\vspace{-.2cm}
\section{Model}

\begin{figure}
\begin{center}
\includegraphics[scale=.4]{../presentation/images/drawing.pdf}
\caption{Illustration of the sparse recovery framework. $\theta_j$ is the unknown weight vector and $b_j \defeq \inprod{\theta_j}{X^t}$ is the result of the sparse product. We observe Bernoulli variables $\mathcal{B}(f(b_j))$.}
\end{center}
\vspace{-2em}
\end{figure}

Let us consider a graph $G = (V, E)$ with $|V| = m$, where the set of edges $E$ is unknown. For a given vertex $j$, the cascade model is parameterized by a vector $\theta_j \in \reals^m$ whose $i$-th coordinate captures the ``influence'' of vertex $i$ on $j$. This influence is $0$ if $i$ is not a parent of $j$. A cascade is characterized by the propagation of a ``contagious'' state in discrete time steps. Initially, each vertex is contagious independently with probability $p_{\text{init}}$; we denote by $X^0$ the indicator vector of the initially contagious vertices. Denoting by $X^t$ the indicator vector of the set of vertices which are contagious at time step $t$, the probability that $j$ is contagious at time $t+1$ is given by:
\begin{equation}
\label{eq:glm}
\P\big[X^{t+1}_j = 1\,|\,X^t\big] = f(\inprod{\theta_j}{X^t})
\end{equation}
where $f: \reals \rightarrow [0,1]$. Conditioned on $X^t$, this evolution happens independently for each vertex at time $t+1$. We show below that both the independent cascade model and the voter model can be cast in this framework.
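To make the framework concrete, the following minimal Python sketch (ours and purely illustrative: the function names, the clipping of probabilities, and the convention of stacking the vectors $\theta_j$ as columns of a matrix $\Theta$ are our own assumptions, not part of any released implementation) simulates one cascade under \eqref{eq:glm} by drawing each coordinate of $X^{t+1}$ from a Bernoulli variable with parameter $f(\inprod{\theta_j}{X^t})$.
\begin{verbatim}
import numpy as np

def simulate_cascade(Theta, f, p_init, T, seed=0):
    # Theta[i, j]: influence of vertex i on j,
    # i.e. column j of Theta is theta_j.
    rng = np.random.default_rng(seed)
    m = Theta.shape[0]
    X = [(rng.random(m) < p_init).astype(float)]
    for _ in range(T):
        # f(theta_j . X^t), for all j at once
        p = np.clip(f(X[-1] @ Theta), 0.0, 1.0)
        X.append((rng.random(m) < p).astype(float))
    return X  # indicator vectors X^0, ..., X^T

# (IC): Theta[i, j] = log(1 - p_ij) <= 0
f_ic = lambda z: 1.0 - np.exp(z)
# (V): Theta nonnegative, column j sums to 1
f_voter = lambda z: z
\end{verbatim}
The two link functions \texttt{f\_ic} and \texttt{f\_voter} correspond to the two special cases derived next.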
\newline
\noindent{\bf Independent Cascade Model.} In the discrete-time IC model, the probability that a susceptible node $j$ becomes infected at the next time step is given by:
\begin{displaymath}
\P\big[X^{t+1}_j = 1\,|\, X^{t}\big] = 1 - \prod_{i = 1}^m (1 - p_{i,j})^{X^t_i}.
\end{displaymath}
Defining $\Theta_{i,j} \defeq \log(1-p_{i,j})$, this can be rewritten as:
\begin{equation}\label{eq:ic}
\tag{IC}
\P\big[X^{t+1}_j = 1\,|\, X^{t}\big] = 1 - \prod_{i = 1}^m e^{\Theta_{i,j}X^t_i}
= 1 - e^{\inprod{\theta_j}{X^t}}
\end{equation}
Therefore, the independent cascade model fits into the previous framework with $f : z \mapsto 1 - e^z$.
\newline
\noindent{\bf Voter Model.} Here, nodes are either \emph{red} or \emph{blue}. In each round, every node $j$ independently chooses one of its neighbors $i$ with probability $\Theta_{i,j}$ and adopts its color. Without loss of generality, we can assume that being \emph{blue} is the contagious state. The cascade stops at a fixed time horizon $T$ or when all nodes are of the same color. If we denote by $X^t$ the indicator vector of the set of blue nodes at time step $t$, then we have:
\begin{equation}
\tag{V}
\P\big[X^{t+1}_j = 1\,|\,X^t\big] = \sum_{i=1}^m \Theta_{i,j} X_i^t = \inprod{\theta_j}{X^t}
\end{equation}
Thus, the linear voter model fits into the previous framework with $f: z \mapsto z$.

\section{Results}
For a given vertex $i$, we are given a set of measurements $(X^t, X^{t+1}_i)_{t\in\mathcal{T}_i}$ generated from \eqref{eq:glm}. We estimate $\theta_i$ via $\ell_1$-regularized maximum likelihood estimation:
\begin{equation}\label{eq:pre-mle}
\hat{\theta}_i \in \argmax_{\theta_i}\; \mathcal{L}_i(\theta_i\,|\,x^1,\ldots,x^n) - \lambda\|\theta_i\|_1
\end{equation}
where $\mathcal{L}_i$ is the log-likelihood of $\theta_i$ given the observations:
\begin{multline*}
\mathcal{L}_i(\theta_i\,|\,x^1,\ldots,x^n) = \frac{1}{|{\cal T}_i|}
\sum_{t\in {\cal T}_i } x_i^{t+1}\log f(\inprod{\theta_i}{x^{t}}) \\
+ (1 - x_i^{t+1})\log\big(1-f(\inprod{\theta_i}{x^t})\big)
\end{multline*}
We will need the following assumptions:
\begin{enumerate}
\item $\log f$ and $\log (1-f)$ are concave functions.
\item $\log f$ and $\log (1-f)$ have derivatives bounded in absolute value by $\frac{1}{\alpha}$ for some $\alpha > 0$.
\item Denoting by $S$ the support of the true parameter vector $\theta_i^*$, define $\mathcal{C}(S) \defeq \{X\in\reals^m\,:\,\|X\|_1\leq 1\text{ and } \|X_{S^c}\|_1\leq 3\|X_S\|_1\}$. We assume that:
\begin{equation*}
\forall X \in \mathcal{C}(S),\quad X^T \nabla^2\mathcal{L}_i(\theta_i^*)\,X \geq \gamma \|X\|_2^2
\end{equation*}
for some $\gamma > 0$, called the \emph{restricted eigenvalue}.
\end{enumerate}
Adapting a result from \cite{Negahban:2009}, we obtain the following finite-sample guarantee:
\begin{theorem}
\label{thm:main}
Assume that assumptions 1--3 above hold and define $n \defeq |\mathcal{T}_i|$. For any $\delta\in(0,1)$, let $\hat{\theta}_i$ be the solution of \eqref{eq:pre-mle} with $\lambda \defeq 2\sqrt{\frac{\log m}{\alpha n^{1 - \delta}}}$. Then:
\begin{equation}
\label{eq:sparse}
\|\hat{\theta}_i - \theta^*_i \|_2 \leq \frac{6}{\gamma}
\sqrt{\frac{s \log m}{\alpha n^{1-\delta}}}
\quad \text{w.p.}\; 1 - e^{-n^\delta \log m}
\end{equation}
\end{theorem}
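As a concrete illustration, the following proximal-gradient sketch is one possible way to solve \eqref{eq:pre-mle} for the IC link $f(z) = 1 - e^z$, for which $\log(1 - f(z)) = z$. The code is ours: the constant step size, the probability clipping, the initialization, and the extra projection onto $\theta_i \leq 0$ (valid for IC weights, since $\Theta_{i,j} = \log(1-p_{i,j}) \leq 0$) are illustrative assumptions rather than part of the analysis; $\lambda$ can be set as in Theorem~\ref{thm:main}.
\begin{verbatim}
import numpy as np

def soft_threshold(v, t):
    return np.sign(v) * np.maximum(np.abs(v) - t, 0.0)

def fit_vertex(X, y, lam, step=0.1, iters=5000):
    # X: (n, m) matrix whose rows are the states x^t
    # preceding each observation of vertex i;
    # y: (n,) vector of the outcomes x_i^{t+1}.
    n, m = X.shape
    theta = np.full(m, -0.1)  # theta = log(1 - p) <= 0
    for _ in range(iters):
        z = X @ theta
        # infection probabilities f(z) = 1 - exp(z),
        # clipped away from 0 for numerical stability
        p = np.clip(1.0 - np.exp(z), 1e-12, 1.0)
        # gradient of the negative averaged log-likelihood
        w = y * (1.0 - p) / p - (1.0 - y)
        grad = (X.T @ w) / n
        # gradient step, soft-thresholding, then
        # projection onto theta <= 0
        theta = soft_threshold(theta - step * grad,
                               step * lam)
        theta = np.minimum(theta, 0.0)
    return theta
\end{verbatim}
Soft-thresholding followed by this projection is exactly the proximal operator of $\lambda\|\cdot\|_1$ restricted to the non-positive orthant, so each iteration is a standard ISTA step on the negative log-likelihood.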
Assumption 3 above can be replaced by the following data-independent assumption:
\begin{enumerate}
\item[3'.] $\log f$ and $\log (1-f)$ are $\varepsilon$-concave, and the expected Gram matrix $\lim_{n\rightarrow\infty}\frac{1}{n}X^TX$, where $X$ is the $n\times m$ design matrix whose $k$-th row is $X^k$, has a smallest ``restricted'' eigenvalue bounded from below by $\gamma>0$,
\end{enumerate}
provided that either $\Omega(s^2 \log m)$ cumulative time steps or $\Omega(s \log m \log^3 (s \log m))$ distinct instances of the diffusion process (cascades) are observed.

\section{Experiments}
We compared the performance of the estimator \eqref{eq:pre-mle} to that of prior algorithms for the Graph Inference problem. Given our estimate $\tilde{\Theta}$ of the edge weights, we recover the edges of the graph by simple thresholding: $\hat{E} = \bigcup_{j \in V} \{(i,j) : \tilde{\Theta}_{i,j} > \eta\}$, for varying values of $\eta$. We used the F1 score as the measure of performance:
\begin{displaymath}
\text{F1} = \frac{2\cdot\text{precision}\cdot\text{recall}}{\text{precision}+\text{recall}}
\end{displaymath}
The algorithms were tested on several synthetic networks generated from standard social network models. The results for the Watts--Strogatz model are shown in Figure~\ref{fig:exp}. The full version of the paper contains more comprehensive experiments.

\begin{figure}
\centering
\includegraphics[scale=.35]{../paper/figures/watts_strogatz.pdf}
\caption{F1 score as a function of the number of observed cascades for a Watts--Strogatz graph, for the Greedy and MLE algorithms from \cite{Netrapalli:2012}, a Lasso algorithm which approximates \eqref{eq:pre-mle}, and the penalized log-likelihood program \eqref{eq:pre-mle}.}
\label{fig:exp}
\end{figure}

\bibliographystyle{abbrv}
\bibliography{sparse}

\end{document}