author    Jean Pouget-Abadie <jean.pougetabadie@gmail.com>  2015-04-30 11:40:17 -0400
committer Jean Pouget-Abadie <jean.pougetabadie@gmail.com>  2015-04-30 11:40:17 -0400
commit    13f312ce5b1201b70375d7cffebc813b25cb35fa (patch)
tree      7b0cbf1d1675d934ef30e098380b8baad0f1205d
parent    7fd038c7b0f00f6db411e9c5037133fd09509a8d (diff)
download  cascades-13f312ce5b1201b70375d7cffebc813b25cb35fa.tar.gz
added more comments from reviewers
-rw-r--r--  paper/sections/experiments.tex | 105
-rw-r--r--  paper/sections/intro.tex       |  23
-rw-r--r--  paper/sections/model.tex       |  11
-rw-r--r--  paper/sections/results.tex     |  91
4 files changed, 165 insertions(+), 65 deletions(-)
diff --git a/paper/sections/experiments.tex b/paper/sections/experiments.tex
index e641992..b60fe44 100644
--- a/paper/sections/experiments.tex
+++ b/paper/sections/experiments.tex
@@ -5,6 +5,10 @@
% & \includegraphics[scale=.23]{figures/kronecker_l2_norm.pdf}
% & \includegraphics[scale=.23]{figures/kronecker_l2_norm_nonsparse.pdf}\\
+{\color{red} TODO:~add running time analysis, theoretical bounds on tested
+graphs, nbr of measurements vs.~number of cascades. One common metric for all
+types of graphs (possibly the least impressive improvement)}
+
\begin{table*}[t]
\centering
\begin{tabular}{l l l}
@@ -12,46 +16,99 @@
& \hspace{-1em}\includegraphics[scale=.28]{figures/watts_strogatz.pdf}
& \hspace{-3em}\includegraphics[scale=.28]{figures/ROC_curve.pdf} \\
-\hspace{-0.5em}(a) Barabasi-Albert (F$1$ \emph{vs.} $n$)
-&\hspace{-1em} (b) Watts-Strogatz (F$1$ \emph{vs.} $n$)
+\hspace{-0.5em} (a) Barabasi-Albert (F$1$ \emph{vs.} $n$)
+&\hspace{-1em} (b) Watts-Strogatz (F$1$ \emph{vs.} $n$)
&\hspace{-3em} (c) Holme-Kim (Prec-Recall) \\
\hspace{-0.5em}\includegraphics[scale=.28]{figures/kronecker_l2_norm.pdf}
& \hspace{-1em}\includegraphics[scale=.28]{figures/kronecker_l2_norm_nonsparse.pdf}
& \hspace{-3em}\includegraphics[scale=.28]{figures/watts_strogatz_p_init.pdf} \\
-(d) Sparse Kronecker ($\ell_2$-norm \emph{vs.} $n$ ) & (e) Non-sparse Kronecker ($\ell_2$-norm \emph{vs.} $n$) & (f) Watts-Strogatz (F$1$ \emph{vs.} $p_{\text{init}}$)
+(d) Sparse Kronecker ($\ell_2$-norm \emph{vs.} $n$) & (e) Non-sparse Kronecker
+ ($\ell_2$-norm \emph{vs.} $n$) & (f) Watts-Strogatz (F$1$ \emph{vs.}
+ $p_{\text{init}}$)
\end{tabular}
-\captionof{figure}{Figures (a) and (b) report the F$1$-score in $\log$ scale
-for 2 graphs as a function of the number of cascades $n$: (a) Barabasi-Albert graph, $300$ nodes, $16200$ edges. (b)
-Watts-Strogatz graph, $300$ nodes, $4500$ edges. Figure (c) plots the Precision-Recall curve for various values of $\lambda$ for a Holme-Kim graph ($200$ nodes, $9772$ edges). Figures (d) and (e) report the
-$\ell_2$-norm $\|\hat \Theta - \Theta\|_2$ for a Kronecker graph which is: (d) exactly sparse (e) non-exactly sparse, as a function of the number of cascades $n$. Figure (f) plots the F$1$-score for the Watts-Strogatz graph as a function of $p_{init}$.}
-\label{fig:four_figs}
+\captionof{figure}{Figures (a) and (b) report the F$1$-score in $\log$ scale for
+ 2 graphs as a function of the number of cascades $n$: (a) Barabasi-Albert
+ graph, $300$ nodes, $16200$ edges. (b) Watts-Strogatz graph, $300$ nodes,
+ $4500$ edges. Figure (c) plots the Precision-Recall curve for various values
+ of $\lambda$ for a Holme-Kim graph ($200$ nodes, $9772$ edges). Figures (d)
+ and (e) report the $\ell_2$-norm $\|\hat \Theta - \Theta\|_2$ for a Kronecker
+graph which is: (d) exactly sparse (e) non-exactly sparse, as a function of the
+number of cascades $n$. Figure (f) plots the F$1$-score for the Watts-Strogatz
+graph as a function of $p_{\text{init}}$.}~\label{fig:four_figs}
\end{table*}
-In this section, we validate empirically the results and assumptions of Section~\ref{sec:results} for varying levels of sparsity and different initializations of parameters ($n$, $m$, $\lambda$, $p_{\text{init}}$), where $p_{\text{init}}$ is the initial probability of a node being a source node. We compare our algorithm to two different state-of-the-art algorithms: \textsc{greedy} and \textsc{mle} from \cite{Netrapalli:2012}. As an extra benchmark, we also introduce a new algorithm \textsc{lasso}, which approximates our \textsc{sparse mle} algorithm.
+In this section, we validate empirically the results and assumptions of
+Section~\ref{sec:results} for varying levels of sparsity and different
+initializations of parameters ($n$, $m$, $\lambda$, $p_{\text{init}}$), where
+$p_{\text{init}}$ is the initial probability of a node being a source node. We
+compare our algorithm to two different state-of-the-art algorithms:
+\textsc{greedy} and \textsc{mle} from~\cite{Netrapalli:2012}. As an extra
+benchmark, we also introduce a new algorithm \textsc{lasso}, which approximates
+our \textsc{sparse mle} algorithm.
\paragraph{Experimental setup}
-We evaluate the performance of the algorithms on synthetic graphs, chosen for their similarity to real social networks. We therefore consider a Watts-Strogatz graph ($300$ nodes, $4500$ edges) \cite{watts:1998}, a Barabasi-Albert graph ($300$ nodes, $16200$ edges) \cite{barabasi:2001}, a Holme-Kim power law graph ($200$ nodes, $9772$ edges) \cite{Holme:2002}, and the recently introduced Kronecker graph ($256$ nodes, $10000$ edges) \cite{Leskovec:2010}. Undirected graphs are converted to directed graphs by doubling the edges.
+We evaluate the performance of the algorithms on synthetic graphs, chosen for
+their similarity to real social networks. We therefore consider a Watts-Strogatz
+graph ($300$ nodes, $4500$ edges)~\cite{watts:1998}, a Barabasi-Albert graph
+($300$ nodes, $16200$ edges)~\cite{barabasi:2001}, a Holme-Kim power law graph
+($200$ nodes, $9772$ edges)~\cite{Holme:2002}, and the recently introduced
+Kronecker graph ($256$ nodes, $10000$ edges)~\cite{Leskovec:2010}. Undirected
+graphs are converted to directed graphs by doubling the edges.
For every reported data point, we sample edge weights and generate $n$ cascades
-from the (IC) model for $n \in \{100, 500, 1000, 2000, 5000\}$. We compare for each algorithm the estimated graph $\hat {\cal G}$ with ${\cal G}$. The initial probability of a node being a source is fixed to $0.05$, i.e. an average of $15$ nodes source nodes per cascades for all experiments, except for Figure~\label{fig:four_figs} (f). All edge weights are chosen uniformly in the interval $[0.2, 0.7]$, except when testing for approximately sparse graphs (see paragraph on robustness). Adjusting for the variance of our experiments, all data points are reported with at most a $\pm 1$ error margin. The parameter $\lambda$ is chosen to be of the order ${\cal O}(\sqrt{\log m / (\alpha n)})$.
+from the (IC) model for $n \in \{100, 500, 1000, 2000, 5000\}$. We compare for
+each algorithm the estimated graph $\hat {\cal G}$ with ${\cal G}$. The initial
+probability of a node being a source is fixed to $0.05$, i.e.\ an average of $15$
+source nodes per cascade for all experiments, except for
+Figure~\ref{fig:four_figs}~(f). All edge weights are chosen uniformly in the
+interval $[0.2, 0.7]$, except when testing for approximately sparse graphs (see
+paragraph on robustness). Adjusting for the variance of our experiments, all
+data points are reported with at most a $\pm 1$ error margin. The parameter
+$\lambda$ is chosen to be of the order ${\cal O}(\sqrt{\log m / (\alpha n)})$.
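The following is a minimal NumPy sketch of the cascade-generation step described
above (illustrative only, not the authors' code); it follows the discrete-time
(IC) dynamics of Section~\ref{sec:model}, with each node chosen as a source
independently with probability $p_{\text{init}}$ and edge weights drawn
uniformly in $[0.2, 0.7]$:
\begin{verbatim}
import numpy as np

def simulate_ic_cascade(P, p_init=0.05, rng=None):
    """Simulate one discrete-time (IC) cascade on weight matrix P (P[i, j] = p_{i,j}).

    Returns the list of 0/1 infection vectors x^0, x^1, ..."""
    rng = np.random.default_rng() if rng is None else rng
    m = P.shape[0]
    contagious = rng.random(m) < p_init        # each node is a source w.p. p_init
    susceptible = ~contagious
    states = [contagious.astype(int)]
    while contagious.any():
        # every contagious i attempts to infect every susceptible j w.p. P[i, j]
        attempts = rng.random((m, m)) < P
        newly = susceptible & attempts[contagious].any(axis=0)
        susceptible &= ~newly                  # newly infected nodes leave the pool
        contagious = newly                     # previous contagious nodes become immune
        if contagious.any():
            states.append(contagious.astype(int))
    return states

# Illustrative weights: a random directed graph with weights uniform in [0.2, 0.7]
rng = np.random.default_rng(0)
m = 50
A = rng.random((m, m)) < 0.1
P = A * rng.uniform(0.2, 0.7, size=(m, m))
cascades = [simulate_ic_cascade(P, rng=rng) for _ in range(100)]
\end{verbatim}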
\paragraph{Benchmarks}
-We compare our \textsc{sparse mle} algorithm to 3 benchmarks: \textsc{greedy} and \textsc{mle} from \cite{Netrapalli:2012} and \textsc{lasso}. The \textsc{mle} algorithm is a maximum-likelihood estimator without $\ell_1$-norm penalization. \textsc{greedy} is an iterative algorithm. We introduced the \textsc{lasso} algorithm in our experiments to achieve faster computation time:
-$$\hat \theta_i \in \arg \min_{\theta} \sum_{t \in {\cal T}} |f(\theta_i\cdot x^t) - x_i^{t+1}|^2 + \lambda \|\theta_i\|_1$$
-\textsc{Lasso} has the merit of being both easier and faster to optimize numerically than the other convex-optimization based algorithms. It approximates the $\textsc{sparse mle}$ algorithm by making the assumption that the observations $x_i^{t+1}$ are of the form: $x_i^{t+1} = f(\theta_i\cdot x^t) + \epsilon$, where $\epsilon$ is random white noise. This is not valid in theory since $\epsilon$ \emph{depends on} $f(\theta_i\cdot x^t)$, however the approximation is validated in practice.
+We compare our \textsc{sparse mle} algorithm to 3 benchmarks: \textsc{greedy}
+and \textsc{mle} from~\cite{Netrapalli:2012} and \textsc{lasso}. The
+\textsc{mle} algorithm is a maximum-likelihood estimator without $\ell_1$-norm
+penalization. \textsc{greedy} is an iterative algorithm. We introduced the
+\textsc{lasso} algorithm in our experiments to achieve faster computation time:
+$$\hat \theta_i \in \arg \min_{\theta_i} \sum_{t \in {\cal T}} |f(\theta_i\cdot
+x^t) - x_i^{t+1}|^2 + \lambda \|\theta_i\|_1$$ \textsc{Lasso} has the merit of
+being both easier and faster to optimize numerically than the other
+convex-optimization-based algorithms. It approximates the \textsc{sparse mle}
+algorithm by making the assumption that the observations $x_i^{t+1}$ are of the
+form: $x_i^{t+1} = f(\theta_i\cdot x^t) + \epsilon$, where $\epsilon$ is random
+white noise. This is not valid in theory, since $\epsilon$ \emph{depends on}
+$f(\theta_i\cdot x^t)$; however, the approximation is validated in practice.
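For concreteness, a minimal sketch of the per-node \textsc{lasso} benchmark (not
the authors' implementation; it assumes scikit-learn and, for illustration, the
identity inverse link $f: z \mapsto z$ of the voter model, in which case the
objective above is a standard lasso):
\begin{verbatim}
import numpy as np
from sklearn.linear_model import Lasso

def lasso_estimate_node(X_t, x_next_i, lam):
    """Estimate theta_i from measurements (x^t, x_i^{t+1}) with identity link.

    X_t:      (n, m) matrix whose rows are the observed states x^t.
    x_next_i: (n,) vector of node i's next states x_i^{t+1}."""
    n = X_t.shape[0]
    # sklearn's Lasso minimizes (1/(2n))||y - Xw||_2^2 + alpha ||w||_1,
    # so the paper's lambda corresponds to alpha = lam / (2n).
    model = Lasso(alpha=lam / (2 * n), fit_intercept=False)
    model.fit(X_t, x_next_i)
    return model.coef_

# e.g. theta_i_hat = lasso_estimate_node(X_t, x_next_i,
#                                        lam=2 * np.sqrt(np.log(m) / (alpha * n)))
\end{verbatim}
For a general inverse link $f$, the benchmark is a penalized nonlinear least
squares problem; the sketch above covers only the identity-link case.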
-We did not benchmark against other known algorithms (\textsc{netrate} \cite{gomezbalduzzi:2011} and \textsc{first edge} \cite{Abrahao:13}) due to the discrete-time assumption. These algorithms also suppose a single-source model, whereas \textsc{sparse mle}, \textsc{mle}, and \textsc{greedy} do not. Learning the graph in the case of a multi-source cascade model is harder (see Figure~\ref{fig:four_figs} (f)) but more realistic, since we rarely have access to ``patient 0'' in practice.
+We did not benchmark against other known algorithms (\textsc{netrate}
+\cite{gomezbalduzzi:2011} and \textsc{first edge}~\cite{Abrahao:13}) due to the
+discrete-time assumption. These algorithms also suppose a single-source model,
+whereas \textsc{sparse mle}, \textsc{mle}, and \textsc{greedy} do not. Learning
+the graph in the case of a multi-source cascade model is harder (see
+Figure~\ref{fig:four_figs} (f)) but more realistic, since we rarely have access
+to ``patient 0'' in practice.
\paragraph{Graph Estimation}
-In the case of the \textsc{lasso}, \textsc{mle} and \textsc{sparse mle} algorithms, we construct the edges of $\hat {\cal G} : \cup_{j \in V} \{(i,j) : \Theta_{ij} > 0.1\}$, \emph{i.e} by thresholding. Finally, we report the F1-score$=2 \text{precision}\cdot\text{recall}/(\text{precision}+\text{recall})$, which considers \emph{(1)} the number of true edges recovered by the algorithm over the total number of edges returned by the algorithm (\emph{precision}) and \emph{(2)} the number of true edges recovered by the algorithm over the total number of edges it should have recovered (\emph{recall}).
-Over all experiments, \textsc{sparse mle} achieves higher rates of precision,
-recall, and F1-score. Interestingly, both \textsc{mle} and \textsc{sparse mle} perform exceptionally well on the Watts-Strogatz graph.
+In the case of the \textsc{lasso}, \textsc{mle} and \textsc{sparse mle}
+algorithms, we construct the edge set of $\hat {\cal G}$ as $\cup_{j \in V}
+\{(i,j) : \hat \Theta_{ij} > 0.1\}$, \emph{i.e.}\ by thresholding. Finally, we
+report the F$1$-score $= 2\,
+\text{precision}\cdot\text{recall}/(\text{precision}+\text{recall})$, which
+considers \emph{(1)} the number of true edges recovered by the algorithm over
+the total number of edges returned by the algorithm (\emph{precision}) and
+\emph{(2)} the number of true edges recovered by the algorithm over the total
+number of edges it should have recovered (\emph{recall}). Over all experiments,
+\textsc{sparse mle} achieves the highest precision, recall, and F$1$-score.
+Interestingly, both \textsc{mle} and \textsc{sparse mle} perform exceptionally
+well on the Watts-Strogatz graph.
+
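A minimal sketch of the thresholding and scoring step just described
(illustrative only; edge sets are compared as sets of $(i,j)$ pairs, with the
$0.1$ threshold used above):
\begin{verbatim}
import numpy as np

def f1_score_edges(Theta_hat, Theta_true, threshold=0.1):
    """Threshold the estimated weights and compare edge sets against the true graph."""
    pred = set(zip(*np.nonzero(Theta_hat > threshold)))   # estimated edges
    true = set(zip(*np.nonzero(Theta_true > 0)))           # true edges
    tp = len(pred & true)                                  # correctly recovered edges
    precision = tp / len(pred) if pred else 0.0
    recall = tp / len(true) if true else 0.0
    if precision + recall == 0:
        return 0.0, precision, recall
    f1 = 2 * precision * recall / (precision + recall)
    return f1, precision, recall
\end{verbatim}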
\begin{comment}
- The recovery rate converges at
-around $5000$ cascades, which is more than $15$ times the number of nodes. By
-contrast, \textsc{sparse mle} achieves a reasonable F$1$-score of $.75$ for roughly $500$ observed cascades.
+ The recovery rate converges at around $5000$ cascades, which is more than
+ $15$ times the number of nodes. By contrast, \textsc{sparse mle} achieves a
+ reasonable F$1$-score of $.75$ for roughly $500$ observed cascades.
\end{comment}
\paragraph{Quantifying robustness}
@@ -60,6 +117,6 @@ The previous experiments only considered graphs with strong edges. To test the
algorithms in the approximately sparse case, we add weak edges to the
previous graphs according to a Bernoulli variable of parameter $1/3$ for every
non-edge, drawing each weight uniformly from $[0,0.1]$. The non-sparse case is
-compared to the sparse case in Figure~\ref{fig:four_figs} (d)--(e) for the $\ell_2$
-norm showing that both the \textsc{lasso}, followed by \textsc{sparse mle} are
-the most robust to noise.
+compared to the sparse case in Figure~\ref{fig:four_figs}~(d)--(e) in terms of
+the $\ell_2$-norm, showing that \textsc{lasso} and \textsc{sparse mle} are the
+most robust to noise, in that order.
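A minimal sketch of this perturbation (illustrative only): every non-edge
independently receives a weak edge with probability $1/3$ and a weight drawn
uniformly from $[0, 0.1]$:
\begin{verbatim}
import numpy as np

def add_weak_edges(Theta, p=1/3, w_max=0.1, rng=None):
    """Return a copy of Theta where each non-edge gets a weak weight w.p. p."""
    rng = np.random.default_rng() if rng is None else rng
    m = Theta.shape[0]
    non_edge = (Theta == 0) & ~np.eye(m, dtype=bool)    # exclude self-loops
    noise = rng.uniform(0.0, w_max, size=Theta.shape)
    mask = (rng.random(Theta.shape) < p) & non_edge
    return Theta + noise * mask
\end{verbatim}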
diff --git a/paper/sections/intro.tex b/paper/sections/intro.tex
index a04b5f1..4d44395 100644
--- a/paper/sections/intro.tex
+++ b/paper/sections/intro.tex
@@ -12,9 +12,9 @@ of nodes which become `infected' over time without knowledge of who has
`infected' whom, can we recover the graph on which the process takes place? The
spread of a particular behavior through a network is known as an {\it Influence
Cascade}. In this context, the {\it Graph Inference}\ problem is the recovery of
-the graph's edges from the observation of few influence cascades. We propose to
-show how sparse recovery can be applied to solve this recently introduced graph
-inference problem.
+the graph's edges from the observation of few influence cascades. {\color{red}
+Cite references} We propose to show how sparse recovery can be applied to solve
+this recently introduced graph inference problem.
{\color{red} Graph inference to Network inference}
Tackling the graph inference problem means constructing a polynomial-time
@@ -70,8 +70,8 @@ as a ``sparse signal'' measured through influence cascades and then recovered.
The challenge is that influence cascade models typically lead to non-linear
inverse problems. The sparse recovery literature suggests that
$\Omega(s\log\frac{m}{s})$ cascade observations should be sufficient to recover
-the graph. However, the best known upper bound to this day is $\O(s^2\log
-m)$.{\color{red} Add reference}
+the graph.{\color{red} Add reference} However, the best known upper bound to
+this day is $\O(s^2\log m)$.{\color{red} Add reference}
The contributions of this paper are the following:
@@ -83,7 +83,8 @@ The contributions of this paper are the following:
\item we give an algorithm which recovers the graph's edges using $\O(s\log
m)$ cascades. Furthermore, we show that our algorithm is also able to
recover the edge weights (the parameters of the influence model),
- a problem which has been seemingly overlooked so far.
+ a problem which has been seemingly overlooked so far. {\color{red} NOT
+ TRUE}
\item we show that our algorithm is robust in cases where the signal to
recover is approximately $s$-sparse by proving guarantees in the
\emph{stable recovery} setting.
@@ -91,10 +92,10 @@ The contributions of this paper are the following:
observations required for sparse recovery.
\end{itemize}
-The organization of the paper is as follows: we conclude the introduction by
-a survey of the related work. In Section~\ref{sec:model} we present our model
-of Generalized Linear Cascades and the associated sparse recovery formulation.
-Its theoretical guarantees are presented for various recovery settings in
+The organization of the paper is as follows: we conclude the introduction by a
+survey of the related work. In Section~\ref{sec:model} we present our model of
+Generalized Linear Cascades and the associated sparse recovery formulation. Its
+theoretical guarantees are presented for various recovery settings in
Section~\ref{sec:results}. The lower bound is presented in
Section~\ref{sec:lowerbound}. Finally, we conclude with experiments in
Section~\ref{sec:experiments}.
@@ -120,6 +121,8 @@ of~\cite{Abrahao:13} studies the same continuous-model framework as
\cite{GomezRodriguez:2010} and obtains an ${\cal O}(s^9 \log^2 s \log m)$
support recovery algorithm, without the \emph{correlation decay} assumption.
+{\color{red} Du et~al.\ make a citation}
+
\begin{comment}
They assume a single-source model, where only one source is selected at random,
which is less realistic in practice since `patient 0' is rarely unknown to us.
diff --git a/paper/sections/model.tex b/paper/sections/model.tex
index 19d7506..fbcedf3 100644
--- a/paper/sections/model.tex
+++ b/paper/sections/model.tex
@@ -25,7 +25,8 @@ contagious nodes ``influence'' other nodes in the graph to become contagious. An
the successive states of the nodes in graph ${\cal G}$. Note that both the
``single source'' assumption made in~\cite{Daneshmand:2014} and
\cite{Abrahao:13} as well as the ``uniformly chosen source set'' assumption made
-in~\cite{Netrapalli:2012} verify condition 3.
+in~\cite{Netrapalli:2012} verify condition 3. {\color{red} why is it less
+restrictive? explain}
In the context of Graph Inference,~\cite{Netrapalli:2012} focus
on the well-known discrete-time independent cascade model recalled below, which
@@ -103,7 +104,7 @@ In the independent cascade model, nodes can be either susceptible, contagious
or immune. At $t=0$, all source nodes are ``contagious'' and all remaining
nodes are ``susceptible''. At each time step $t$, for each edge $(i,j)$ where
$j$ is susceptible and $i$ is contagious, $i$ attempts to infect $j$ with
-probability $p_{i,j}\in]0,1]$; the infection attempts are mutually independent.
+probability $p_{i,j}\in(0,1]$; the infection attempts are mutually independent.
If $i$ succeeds, $j$ will become contagious at time step $t+1$. Regardless of
$i$'s success, node $i$ will be immune at time $t+1$. In other words, nodes
stay contagious for only one time step. The cascade process terminates when no
@@ -147,6 +148,9 @@ step $t$, then we have:
Thus, the linear voter model is a Generalized Linear Cascade model
with inverse link function $f: z \mapsto z$.
+{\color{red} \subsubsection{Discretization of Continuous Model}
+TODO}
+
% \subsection{The Linear Threshold Model}
% In the Linear Threshold Model, each node $j\in V$ has a threshold $t_j$ from
@@ -171,6 +175,7 @@ with inverse link function $f: z \mapsto z$.
% X^t\right] = \text{sign} \left(\inprod{\theta_j}{X^t} - t_j \right)
% \end{equation} where ``sign'' is the function $\mathbbm{1}_{\cdot > 0}$.
+{\color{red} Add drawing of math problem as in Edo's presentation}
\subsection{Maximum Likelihood Estimation}
@@ -227,4 +232,4 @@ a twice-differentiable function $f$ is log concave iff. $f''f \leq f'^2$. It is
easy to verify this property for $f$ and $(1-f)$ in the Independent Cascade
Model and Voter Model.
-{\color{red} TODO: talk about the different constraints}
+{\color{red} TODO:~talk about the different constraints}
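As a concrete check of the log-concavity criterion above (assuming, for
illustration, the inverse link $f(z) = 1 - e^{-z}$ for the Independent Cascade
Model; that parametrization is not shown in this excerpt):
\begin{align*}
  f(z) &= 1 - e^{-z}: & f''(z)\,f(z) &= -e^{-z}\bigl(1 - e^{-z}\bigr) \leq 0 \leq e^{-2z} = f'(z)^2,\\
  (1-f)(z) &= e^{-z}: & (1-f)''(z)\,(1-f)(z) &= e^{-2z} = \bigl((1-f)'(z)\bigr)^2,
\end{align*}
so both $f$ and $1-f$ satisfy $f''f \leq f'^2$ and are therefore log concave.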
diff --git a/paper/sections/results.tex b/paper/sections/results.tex
index b156897..54fc587 100644
--- a/paper/sections/results.tex
+++ b/paper/sections/results.tex
@@ -92,6 +92,7 @@ $n$, which is different from the number of cascades. For example, in the case
of the voter model with horizon time $T$ and for $N$ cascades, we can expect
a number of measurements proportional to $N\times T$.
+{\color{red} Move this to the model section}
Before moving to the proof of Theorem~\ref{thm:main}, note that interpreting it
in the case of the Independent Cascade Model requires one more step. Indeed, to
cast it as a generalized linear cascade model, we had to perform the
@@ -114,8 +115,7 @@ which gives a bound on the convergence rate of regularized estimators. We state
their theorem in the context of $\ell_1$ regularization in
Lemma~\ref{lem:negahban}.
-\begin{lemma}
- \label{lem:negahban}
+\begin{lemma} \label{lem:negahban}
Let ${\cal C}(S) \defeq \{ \Delta \in \mathbb{R}^m\,|\,\|\Delta_S\|_1 \leq
3 \|\Delta_{S^c}\|_1 \}$. Suppose that:
\begin{multline}
@@ -142,6 +142,8 @@ implied by the (RE)-condition.. The upper bound
on the $\ell_{\infty}$ norm of $\nabla\mathcal{L}(\theta^*)$ is given by
Lemma~\ref{lem:ub}.
+{\color{red} explain usefulness/interpretation and contribution}
+{\color{red} Sketch proof, full proof in appendix}
\begin{lemma}
\label{lem:ub}
Assume {\bf(LF)} holds for some $\alpha>0$. For any $\delta\in(0,1)$:
@@ -245,32 +247,35 @@ In other words, the closer $\theta^*$ is to being sparse, the smaller the
price, and we recover the results of Section~\ref{sec:main_theorem} in the
limit of exact sparsity. These results are formalized in the following theorem,
which is also a consequence of Theorem 1 in \cite{Negahban:2009}.
+{\color{red} Include full proof in appendix}
\begin{theorem}
\label{thm:approx_sparse}
-Suppose the {\bf(RE)} assumption holds for the Hessian $\nabla^2
-f(\theta^*)$ and $\tau_{\mathcal{L}}(\theta^*) = \frac{\kappa_2\log m}{n}\|\theta^*\|_1$
-on the following set:
+Suppose the {\bf(RE)} assumption holds for the Hessian $\nabla^2 f(\theta^*)$
+and $\tau_{\mathcal{L}}(\theta^*) = \frac{\kappa_2\log m}{n}\|\theta^*\|_1$ on
+the following set:
\begin{align}
\nonumber
{\cal C}' \defeq & \{X \in \mathbb{R}^p : \|X_{S^c}\|_1 \leq 3 \|X_S\|_1
+ 4 \|\theta^* - \theta^*_{\lfloor s \rfloor}\|_1 \} \\ \nonumber
& \cap \{ \|X\|_1 \leq 1 \}
\end{align}
-If the number of measurements $n\geq \frac{64\kappa_2}{\gamma}s\log m$, then
-by solving \eqref{eq:pre-mle} for $\lambda \defeq 2\sqrt{\frac{\log m}{\alpha n^{1 - \delta}}}$ we have:
+If the number of measurements $n\geq \frac{64\kappa_2}{\gamma}s\log m$, then by
+solving \eqref{eq:pre-mle} for $\lambda \defeq 2\sqrt{\frac{\log m}{\alpha n^{1
+- \delta}}}$ we have:
\begin{align*}
- \|\hat \theta - \theta^* \|_2 \leq
- \frac{3}{\gamma} \sqrt{\frac{s\log m}{\alpha n^{1-\delta}}}
- + 4 \sqrt[4]{\frac{s\log m}{\gamma^4\alpha n^{1-\delta}}} \|\theta^* - \theta^*_{\lfloor s \rfloor}\|_1
+ \|\hat \theta - \theta^* \|_2 \leq \frac{3}{\gamma} \sqrt{\frac{s\log
+ m}{\alpha n^{1-\delta}}} + 4 \sqrt[4]{\frac{s\log m}{\gamma^4\alpha
+ n^{1-\delta}}} \|\theta^* - \theta^*_{\lfloor s \rfloor}\|_1
\end{align*}
\end{theorem}
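For intuition, specializing the bound above to the exactly sparse case (so that
$\theta^* = \theta^*_{\lfloor s \rfloor}$ and the second term vanishes) gives
\begin{align*}
  \|\hat \theta - \theta^* \|_2 \leq \frac{3}{\gamma} \sqrt{\frac{s\log m}{\alpha n^{1-\delta}}},
\end{align*}
which recovers the rate of Section~\ref{sec:main_theorem}.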
-As before, edge recovery is a consequence of upper-bounding $\|\theta^* - \hat \theta\|_2$
+As before, edge recovery is a consequence of upper-bounding $\|\theta^* - \hat
+\theta\|_2$:
\begin{corollary}
- Under the same assumptions as Theorem~\ref{thm:approx_sparse}, if the number of
- measurements verifies: \begin{equation}
+ Under the same assumptions as Theorem~\ref{thm:approx_sparse}, if the number
+ of measurements verifies: \begin{equation}
n > \frac{9}{\alpha\gamma^2\epsilon^2}\left(1+
\frac{16}{\epsilon^2}\| \theta^* - \theta^*_{\lfloor
s\rfloor}\|_1\right)s\log m
@@ -279,8 +284,6 @@ then similarly: ${\cal S}^*_{\eta + \epsilon} \subset \hat {\cal S}_\eta
\subset {\cal S}^*$ w.p. at least $1-\frac{1}{m}$.
\end{corollary}
-
-
\subsection{Restricted Eigenvalue Condition}
\label{sec:re}
@@ -312,6 +315,10 @@ a re-weighted Gram matrix of the observations. In other words, the restricted
eigenvalue condition states that the observed sets of active nodes are not
too collinear with each other.
+{\color{red} if the function is strictly log-convex, then equivalent -> explain
+what the gram matrix is (explanation)}
+
+{\color{red} move to model section, small example}
In the specific case of ``logistic cascades'' (when $f$ is the logistic
function), the Hessian simplifies to $\nabla^2\mathcal{L}(\theta^*)
= \frac{1}{|\mathcal{T}|}XX^T$ where $X$ is the observation matrix $[x^1 \ldots
@@ -351,13 +358,14 @@ again non restrictive in the (IC) model and (V) model.
\begin{proposition}
\label{prop:fi}
- Suppose $\E[\nabla^2\mathcal{L}(\theta^*)]$ verifies the $(S,\gamma)$-{\bf (RE)}
- condition and assume {\bf (LF)} and {\bf (LF2)}. For $\delta> 0$, if $n^{1-\delta}\geq
-\frac{M+2}{21\gamma\alpha}s^2\log m
- $, then $\nabla^2\mathcal{L}(\theta^*)$ verifies the $(S,\frac{\gamma}{2})$-(RE)
+ Suppose $\E[\nabla^2\mathcal{L}(\theta^*)]$ verifies the $(S,\gamma)$-{\bf
+ (RE)} condition and assume {\bf (LF)} and {\bf (LF2)}. For $\delta> 0$, if
+ $n^{1-\delta}\geq \frac{M+2}{21\gamma\alpha}s^2\log m $, then
+ $\nabla^2\mathcal{L}(\theta^*)$ verifies the $(S,\frac{\gamma}{2})$-(RE)
condition, w.p $\geq 1-e^{-n^\delta\log m}$.
\end{proposition}
+{\color{red} sketch proof, full (AND BETTER) proof in appendix}
\begin{proof}Writing $H\defeq \nabla^2\mathcal{L}(\theta^*)$, if
$ \forall\Delta\in C(S),\;
\|\E[H] - H\|_\infty\leq \lambda $
@@ -366,8 +374,8 @@ again non restrictive in the (IC) model and (V) model.
\begin{equation}
\label{eq:foo}
\forall \Delta\in C(S),\;
- \Delta H\Delta \geq
- \Delta \E[H]\Delta(1-32s\lambda/\gamma)
+ \Delta H\Delta \geq
+ \Delta \E[H]\Delta(1-32s\lambda/\gamma)
\end{equation}
Indeed, $
|\Delta(H-E[H])\Delta| \leq 2\lambda \|\Delta\|_1^2\leq
@@ -407,11 +415,16 @@ likelihood function also known as the {\it (S,s)-irrepresentability} condition.
\begin{comment}
\begin{definition}
-Following similar notation, let $Q^* \defeq \nabla^2 f(\theta^*)$. Let $S$ and $S^c$ be the set of indices of all the parents and non-parents respectively and $Q_{S,S}$, $Q_{S^c,S}$, $Q_{S, S^c}$, and $Q_{S^c, S^c}$ the induced sub-matrices. Consider the following constant:
+Following similar notation, let $Q^* \defeq \nabla^2 f(\theta^*)$. Let $S$ and
+$S^c$ be the set of indices of all the parents and non-parents respectively and
+$Q_{S,S}$, $Q_{S^c,S}$, $Q_{S, S^c}$, and $Q_{S^c, S^c}$ the induced
+sub-matrices. Consider the following constant:
\begin{equation}
-\nu_{\text{irrepresentable}}(S) \defeq \max_{\tau \in \mathbb{R}^p \ :\ \| \tau \|_{\infty} \leq 1} \|Q_{S^c, S} Q_{S, S}^{-1} \tau\|_{\infty}
+\nu_{\text{irrepresentable}}(S) \defeq \max_{\tau \in \mathbb{R}^p \ :\ \| \tau
+\|_{\infty} \leq 1} \|Q_{S^c, S} Q_{S, S}^{-1} \tau\|_{\infty}
\end{equation}
-The (S,s)-irrepresentability holds if $\nu_{\text{irrepresentable}}(S) < 1 - \epsilon$ for $\epsilon > 0$
+The (S,s)-irrepresentability holds if $\nu_{\text{irrepresentable}}(S) < 1 -
+\epsilon$ for $\epsilon > 0$
\end{definition}
\end{comment}
@@ -423,7 +436,11 @@ the {\bf(RE)} condition for $\ell_2$-recovery.
\begin{comment}
\begin{proposition}
\label{prop:irrepresentability}
-If the irrepresentability condition holds with $\epsilon > \frac{2}{3}$, then the restricted eigenvalue condition holds with constant $\gamma_n \geq \frac{ (1 - 3(1 -\epsilon))^2 \lambda_{\min}^2}{4s}n$, where $\lambda_{\min} > 0$ is the smallest eigenvalue of $Q^*_{S,S}$, on which the results of \cite{Daneshmand:2014} also depend.
+If the irrepresentability condition holds with $\epsilon > \frac{2}{3}$, then
+the restricted eigenvalue condition holds with constant $\gamma_n \geq \frac{
+(1 - 3(1 -\epsilon))^2 \lambda_{\min}^2}{4s}n$, where $\lambda_{\min} > 0$ is
+the smallest eigenvalue of $Q^*_{S,S}$, on which the results of
+\cite{Daneshmand:2014} also depend.
\end{proposition}
\end{comment}
@@ -434,18 +451,36 @@ a correlation between variables. Consider the following simplified example from
\cite{vandegeer:2011}:
\begin{equation}
\nonumber
-\left(
+\left(
\begin{array}{cccc}
I_s & \rho J \\
\rho J & I_s \\
\end{array}
\right)
\end{equation}
-where $I_s$ is the $s \times s$ identity matrix, $J$ is the all-ones matrix and $\rho \in \mathbb{R}^+$. It is easy to see that $\nu_{\text{irrepresentable}}(S) = \rho s$ and $\lambda_{\min}(Q) \geq 1 - \rho$, such that for any $\rho > \frac{1}{s}$ and $\rho < 1$, the restricted eigenvalue holds trivially but the (S,s)-irrepresentability does not hold.
+where $I_s$ is the $s \times s$ identity matrix, $J$ is the all-ones matrix and
+$\rho \in \mathbb{R}^+$. It is easy to see that $\nu_{\text{irrepresentable}}(S)
+= \rho s$ and $\lambda_{\min}(Q) \geq 1 - \rho$, such that for any $\rho >
+\frac{1}{s}$ and $\rho < 1$, the restricted eigenvalue holds trivially but the
+(S,s)-irrepresentability does not hold.
\begin{lemma}
-Let ${\cal C}({\cal M}, \bar {\cal M}^\perp, \theta^*) \defeq \{ \Delta \in \mathbb{R}^p | {\cal R}(\Delta_{\bar {\cal M}^\perp} \leq 3 {\cal R}(\Delta_{\bar {\cal M}} + 4 {\cal R}(\theta^*_{{\cal M}^\perp}) \}$, where $\cal R$ is a \emph{decomposable} regularizer with respect to $({\cal M}, \bar {\cal M}^\perp)$, and $({\cal M}, \bar {\cal M})$ are two subspaces such that ${\cal M} \subseteq \bar {\cal M}$. Suppose that $\exists \kappa_{\cal L} > 0, \; \exists \tau_{\cal L}, \; \forall \Delta \in {\cal C}, \; {\cal L}(\theta^* + \Delta) - {\cal L}(\theta^*) - \langle \Delta {\cal L}(\theta^*), \Delta \rangle \geq \kappa_{\cal L} \|\Delta\|^2 - \tau_{\cal L}^2(\theta^*)$. Let $\Psi({\cal M}) \defeq \sup_{u \in {\cal M} \backslash \{0\}} \frac{{\cal R}(u)}{\|u\|}$. Finally suppose that $\lambda \geq 2 {\cal R}(\nabla {\cal L}(\theta^*))$, where ${\cal R}^*$ is the conjugate of ${\cal R}$. Then: $$\|\hat \theta_\lambda - \theta^* \|^2 \leq 9 \frac{\lambda^2}{\kappa_{\cal L}}\Psi^2(\bar {\cal M}) + \frac{\lambda}{\kappa_{\cal L}}\{2 \tau^2_{\cal L}(\theta^*) + 4 {\cal R}(\theta^*_{{\cal M}^\perp}\}$$
+Let ${\cal C}({\cal M}, \bar {\cal M}^\perp, \theta^*) \defeq \{ \Delta \in
+\mathbb{R}^p | {\cal R}(\Delta_{\bar {\cal M}^\perp}) \leq 3 {\cal
+R}(\Delta_{\bar {\cal M}}) + 4 {\cal R}(\theta^*_{{\cal M}^\perp}) \}$, where
+$\cal R$ is a \emph{decomposable} regularizer with respect to $({\cal M}, \bar
+{\cal M}^\perp)$, and $({\cal M}, \bar {\cal M})$ are two subspaces such that
+${\cal M} \subseteq \bar {\cal M}$. Suppose that $\exists \kappa_{\cal L} > 0,
+\; \exists \tau_{\cal L}, \; \forall \Delta \in {\cal C}, \; {\cal L}(\theta^* +
+\Delta) - {\cal L}(\theta^*) - \langle \nabla {\cal L}(\theta^*), \Delta \rangle
+\geq \kappa_{\cal L} \|\Delta\|^2 - \tau_{\cal L}^2(\theta^*)$. Let $\Psi({\cal
+M}) \defeq \sup_{u \in {\cal M} \backslash \{0\}} \frac{{\cal R}(u)}{\|u\|}$.
+Finally, suppose that $\lambda \geq 2 {\cal R}^*(\nabla {\cal L}(\theta^*))$,
+where ${\cal R}^*$ is the conjugate of ${\cal R}$. Then: $$\|\hat \theta_\lambda -
+\theta^* \|^2 \leq 9 \frac{\lambda^2}{\kappa_{\cal L}}\Psi^2(\bar {\cal M}) +
+\frac{\lambda}{\kappa_{\cal L}}\{2 \tau^2_{\cal L}(\theta^*) + 4 {\cal
+R}(\theta^*_{{\cal M}^\perp})\}$$
\end{lemma}
\subsection{The Independent Cascade Model}