 paper/sections/appendix.tex |  44
 paper/sections/model.tex    |   3
 paper/sections/results.tex  | 172
 3 files changed, 148 insertions(+), 71 deletions(-)
diff --git a/paper/sections/appendix.tex b/paper/sections/appendix.tex
index 57658b4..2f0162e 100644
--- a/paper/sections/appendix.tex
+++ b/paper/sections/appendix.tex
@@ -7,50 +7,6 @@
 \end{multline*}
 \end{comment}
 
-\subsection{Upper-bound for $\|\nabla \mathcal{L}(\theta^*)\|_\infty$}
-
-The gradient of $\mathcal{L}$ (as a function of $\theta$) is given by:
-\begin{multline*}
-  \nabla \mathcal{L}(\theta) =
-  \frac{1}{|\mathcal{T}|}\sum_{t\in \mathcal{T}}x^t\bigg[
-  x_i^{t+1}\frac{f'}{f}(\inprod{\theta_i}{x^t})\\
-  - (1-x_i^{t+1})\frac{f'}{1-f}(\inprod{\theta_i}{x^t})\bigg]
-\end{multline*}
-
-Let us focus on one coordinate of the gradient, the partial derivative $\partial_j
-\mathcal{L}(\theta)$ with respect to the $j$-th coordinate. Writing
-$\partial_j\mathcal{L}(\theta)
-= \frac{1}{|\mathcal{T}|}\sum_{t\in\mathcal{T}} Y_t$ and since
-$\E[x_i^{t+1}|x^t]= f(\inprod{\theta_i}{x^t})$, we have that $\E[Y_{t+1}|Y_t]
-= 0$. Hence $Z_t = \sum_{k=1}^t Y_k$ is a martingale.
-
-Let us assume that we can bound $\frac{f'}{f}(\theta_i\cdot x^t)$ and
-$\frac{f'}{1-f}(\theta_i\cdot x^t)$ almost surely and in absolute value by
-$\frac{1}{\eta}$. It is easy to verify that this will be true if for all
-$(i,j)\in E$, $\delta\leq p_{i,j}\leq 1-\delta$ in the Voter model and when
-$p_{i,j}\leq 1-\delta$ in the Independent Cascade model.
-
-Under the assumption of the previous paragraph, we have almost surely
-$|Z_{t+1}-Z_t|\leq \frac{1}{\eta}$ and we can apply Azuma's inequality to the
-martingale $Z_t$:
-\begin{displaymath}
-  \P\big[|Z_{\mathcal{T}}|\geq \lambda\big]\leq
-  2\exp\left(\frac{-\lambda^2\eta}{2n}\right)
-\end{displaymath}
-
-Applying a union bound to have the previous inequality hold for all coordinates
-of $\nabla\mathcal{L}(\theta)$ implies the following bound:
-\begin{align*}
-  \P\big[\|\nabla\mathcal{L}(\theta)\|_{\infty}\geq \lambda \big]
-  &\leq 2m\exp\left(\frac{-\lambda^2n\eta}{2}\right)\\
-  &= 2\exp\left(\frac{-\lambda^2n\eta}{2}+\log m\right)
-\end{align*}
-
-Choosing $\lambda\defeq 2\sqrt{\frac{\log m}{\eta n^{1-\delta}}}$ concludes the
-proof of Lemma~\ref{lem:icc_lambda_upper_bound}.
-
-
-
 \subsection{Proposition~\ref{prop:irrepresentability}}
 
 In the words and notation of Theorem 9.1 in \cite{vandegeer:2009}:
diff --git a/paper/sections/model.tex b/paper/sections/model.tex
index 25f55e0..d3403ef 100644
--- a/paper/sections/model.tex
+++ b/paper/sections/model.tex
@@ -120,6 +120,7 @@
 Therefore, the independent cascade model is a Generalized Linear Cascade model w
 
 \subsection{Maximum Likelihood Estimation}
+\label{sec:mle}
 
 Recovering the model parameter $\Theta$ from observed influence cascades is the
 central question of the present work. Recovering the edges in $E$ from observed
@@ -149,7 +150,7 @@
 of $m$ terms, each term $i\in\{1,\ldots,m\}$ only depending on $\theta_i$.
 Since this is equally true for $\|\Theta\|_1$, each column $\theta_i$ of
 $\Theta$ can be estimated by a separate optimization program:
 \begin{equation}\label{eq:pre-mle}
-  \hat{\theta}_i \in \argmax_{\theta} \frac{1}{n}\mathcal{L}_i(\theta_i\,|\,x^1,\ldots,x^n)
+  \hat{\theta}_i \in \argmax_{\theta} \mathcal{L}_i(\theta_i\,|\,x^1,\ldots,x^n)
   - \lambda\|\theta_i\|_1
 \end{equation}
 where we denote by ${\cal T}_i$ the time steps at which node $i$ is susceptible
 and:
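
For concreteness, here is a minimal numerical sketch of the per-node program
\eqref{eq:pre-mle}. It assumes a logistic inverse link $f(z)=1/(1+e^{-z})$ and
a plain proximal-gradient (ISTA) solver; neither choice is prescribed by the
paper, and all identifiers below are illustrative.

# Minimal sketch of the per-node l1-regularized MLE (eq:pre-mle).
# Illustrative assumptions: logistic inverse link f, ISTA solver.
import numpy as np

def f(z):
    # inverse link function; the model only requires f to be fixed and known
    return 1.0 / (1.0 + np.exp(-z))

def soft_threshold(v, t):
    # proximal operator of t * ||.||_1; produces exact zeros
    return np.sign(v) * np.maximum(np.abs(v) - t, 0.0)

def estimate_theta(X, y, lam, step=0.1, iters=2000):
    """X: (n, m) states x^t while node i is susceptible; y: (n,) outcomes x_i^{t+1}."""
    n, m = X.shape
    theta = np.zeros(m)
    for _ in range(iters):
        grad = X.T @ (f(X @ theta) - y) / n   # gradient of the negative log-likelihood
        theta = soft_threshold(theta - step * grad, step * lam)
    return theta

# toy cascade data: 5 true parents out of m = 50 candidates
rng = np.random.default_rng(0)
n, m, s = 2000, 50, 5
theta_star = np.zeros(m)
theta_star[:s] = 1.0
X = rng.binomial(1, 0.3, (n, m)).astype(float)
y = rng.binomial(1, f(X @ theta_star)).astype(float)
lam = 2.0 * np.sqrt(np.log(m) / n)            # regularization level of the results below
theta_hat = estimate_theta(X, y, lam)
print(np.nonzero(np.abs(theta_hat) > 0.2)[0])  # thresholding recovers the support

The soft-thresholding step is what produces exact zeros in $\hat{\theta}_i$;
the final thresholding line is the simple support-recovery step that the
results of the next section justify.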
diff --git a/paper/sections/results.tex b/paper/sections/results.tex
index 31e478c..216ad65 100644
--- a/paper/sections/results.tex
+++ b/paper/sections/results.tex
@@ -1,50 +1,162 @@
-In this section, we exploit standard techniques in sparse recovery and leverage the simple nature of generalized linear cascades to address the standard problem of edge detection. We extend prior work by showing that edge weights of the graph can be recovered under similar assumptions. We extend these results by considering the non-exactly sparse case.
+In this section, we apply the sparse recovery framework to analyze under which
+assumptions our program~\eqref{eq:pre-mle} recovers the true parameter
+$\theta_i$ of the cascade model. Furthermore, if we can estimate $\theta_i$ to
+sufficiently high accuracy, it is then possible to recover the support of
+$\theta_i$ by simple thresholding, which provides a solution to the standard
+Graph Inference problem.
+
+We will first give results in the exactly sparse setting, in which $\theta_i$
+has support of size exactly $s$. We will then relax this sparsity constraint
+and give results in the \emph{stable recovery} setting, where $\theta_i$ is
+approximately $s$-sparse.
+
+As mentioned in Section~\ref{sec:mle}, the maximum likelihood estimation
+program is decomposable. We will henceforth focus on a single node $i\in V$ and
+omit the subscript $i$ in the notation when there is no ambiguity. The
+recovery problem is now that of estimating a single vector $\theta^*$ from
+a set $\mathcal{T}$ of observations. We will write $n\defeq |\mathcal{T}|$.
+\begin{comment}
+As mentioned previously, our objective is twofold:
 \begin{enumerate}
-\item We seek to recover the edges of the graph. In other words, for each node $j$, we wish to identify the set of indices $\{i: \Theta_{ij} \neq 0\}$ .
-\item We seek to recover the edge weights $\Theta$ of ${\cal G}$ i.e. upper-bound the error $\|\hat \Theta - \Theta \|_2$.
+  \item We seek to recover the edges of the graph. In other words, for each
+    node $j$, we wish to identify the set of indices $\{i: \Theta_{ij} \neq
+    0\}$.
+  \item We seek to recover the edge weights $\Theta$ of ${\cal G}$, i.e.\ to
+    upper-bound the error $\|\hat \Theta - \Theta \|_2$.
 \end{enumerate}
-It is easy to see that solving the second objective allows us to solve the first. If $\|\hat \Theta - \Theta \|_2$ is sufficiently small, we can recover all large coordinates of $\Theta$ by keeping only the coordinates above a certain threshold.
+It is easy to see that solving the second objective allows us to solve the
+first. If $\|\hat \Theta - \Theta \|_2$ is sufficiently small, we can recover
+all large coordinates of $\Theta$ by keeping only the coordinates above
+a certain threshold.
+\end{comment}
 
 \subsection{Main Theorem}
 \label{sec:main_theorem}
 
-For the remainder of this section, we consider a single node $i$. For ease of presentation, the index $j$ is made implicit: $\theta_i \leftarrow \Theta_{ij}$ and $\theta \leftarrow (\Theta_{i\cdot})$.
-We begin our analysis with the following lemma:
-\begin{lemma}
-\label{lem:icc_lambda_upper_bound}
-For any $\delta > 0$, with probability $1-e^{-n^\delta \log m}$, $\|\nabla {\cal L}(\theta^*)\|_{\infty} \leq 2 \sqrt{\frac{\log m}{n^{1 - \delta} p_{\min}}}$
-\end{lemma}
-
-Consider the well-known \emph{restricted eigenvalue condition} for a symetric matrix $\Sigma$ and set ${\cal C} := \{X \in \mathbb{R}^p : \|X_{S^c}\|_1 \leq 3 \|X_S\|_1 \} \cap \{ \|X\|_1 \leq 1 \}$
+In this section, we analyze the case where $\theta^*$ is exactly sparse. We
+write $S\defeq\text{supp}(\theta^*)$ and $s\defeq|S|$. Our main theorem relies
+on the now standard \emph{restricted eigenvalue condition}.
+\begin{definition}
+  Let $\Sigma\in\mathcal{S}_m(\reals)$ be a real symmetric matrix and let
+  $S\subseteq\{1,\ldots,m\}$. Define $\mathcal{C}(S)\defeq
+  \{X\in\reals^m\,:\,\|X\|_1\leq 1\text{ and } \|X_{S^c}\|_1\leq
+  3\|X_S\|_1\}$. We say that $\Sigma$ satisfies the
+  $(S,\gamma)$-\emph{restricted eigenvalue condition} iff:
 \begin{equation}
-\nonumber
-\forall X \in {\cal C}, \| \Sigma X \|_2^2 \geq \gamma_n \|X\|_2^2
+  \forall X \in {\cal C}(S), \| \Sigma X \|_2^2 \geq \gamma \|X\|_2^2
 \tag{RE}
+\label{eq:re}
 \end{equation}
+\end{definition}
 
-The following theorem is a consequence of Theorem 1 in \cite{Negahban:2009} and Lemma~\ref{lem:icc_lambda_upper_bound}.
+A discussion of this assumption in the context of generalized linear cascade
+models can be found in Section~\ref{sec:assumptions}.
+
+We will also need the following assumption on the inverse link function $f$ of
+the generalized linear cascade model:
+\begin{equation}
+  \tag{LF}
+  \max\left(\left|\frac{f'}{f}(\inprod{\theta^*}{x})\right|,
+  \left|\frac{f'}{1-f}(\inprod{\theta^*}{x})\right|\right)\leq
+  \frac{1}{\alpha}
+\end{equation}
+for all $x\in\reals^m$ such that $f(\inprod{\theta^*}{x})\notin\{0,1\}$.
+
+It is easy to verify that this holds if $\delta\leq p_{i,j}\leq 1-\delta$ for
+all $(i,j)\in E$ in the Voter model, and if $p_{i,j}\leq 1-\delta$ in the
+Independent Cascade model.
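
Certifying \eqref{eq:re} for a concrete Hessian is hard in general, but it can
be probed empirically. The sketch below is a heuristic Monte Carlo check of
our own construction, not a procedure from the paper: it samples random
directions, rescales them into the cone $\mathcal{C}(S)$, and reports the
smallest observed ratio $\|\Sigma X\|_2^2/\|X\|_2^2$ as a candidate value for
$\gamma$.

# Heuristic Monte Carlo probe of the (S, gamma)-restricted eigenvalue
# condition; the result is only an empirical candidate for gamma, not
# a certificate.
import numpy as np

def re_constant_estimate(Sigma, S, trials=20000, seed=0):
    rng = np.random.default_rng(seed)
    m = Sigma.shape[0]
    Sc = np.setdiff1d(np.arange(m), S)
    worst = np.inf
    for _ in range(trials):
        x = rng.standard_normal(m)
        # rescale off-support coordinates so that ||x_{S^c}||_1 <= 3 ||x_S||_1;
        # the ||X||_1 <= 1 cap is immaterial here since the ratio is scale-invariant
        nS, nSc = np.abs(x[S]).sum(), np.abs(x[Sc]).sum()
        if nSc > 3.0 * nS:
            x[Sc] *= 3.0 * nS / nSc
        worst = min(worst, np.sum((Sigma @ x) ** 2) / np.sum(x ** 2))
    return worst

# toy Sigma built like an empirical Hessian from binary snapshots
rng = np.random.default_rng(1)
X = rng.binomial(1, 0.3, (5000, 20)).astype(float)
Sigma = X.T @ X / 5000
print(re_constant_estimate(Sigma, S=np.arange(4)))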
 
 \begin{theorem}
-\label{thm:neghaban}
-Suppose the true vector $\theta^*$ has support S of size s and the {\bf(RE)} assumption holds for the Hessian $\nabla^2 {\cal L}(\theta)$ with $\gamma > 0$, then for $\delta > 0$ by solving \eqref{eq:pre-mle} with $\lambda \defeq 2\sqrt{\frac{\log m}{n^{1 - \delta} p_{\min}}}$, we have with probability $1-\exp(-n^\delta \log m)$:
+\label{thm:main}
+Assume that the Hessian $\nabla^2\mathcal{L}(\theta^*)$ satisfies the
+$(S,\gamma)$-{\bf(RE)} condition for some $\gamma > 0$ and that {\bf (LF)}
+holds for some $\alpha > 0$. For any $\delta\in(0,1)$, let $\hat{\theta}$ be
+the solution of \eqref{eq:pre-mle} with $\lambda \defeq
+2\sqrt{\frac{\log m}{\alpha n^{1-\delta}}}$. Then:
 \begin{equation}
-\label{eq:sparse}
-\|\hat \theta - \theta \|_2 \leq \frac{3}{\gamma} \sqrt{\frac{s \log m}{p_{\min} n^{1-\delta}}}
+  \label{eq:sparse}
+  \|\hat \theta - \theta^* \|_2
+  \leq \frac{3}{\gamma} \sqrt{\frac{s \log m}{\alpha n^{1-\delta}}}
+\quad
+\text{w.p.}\;1-e^{-n^\delta \log m}
 \end{equation}
 \end{theorem}
 
-\begin{proof}
-Adopting the notation of \cite{Negahban:2009}:
+This theorem is a consequence of Theorem~1 in \cite{Negahban:2009}, which
+gives a bound on the convergence rate of regularized estimators. We state
+their theorem in the context of $\ell_1$ regularization in
+Lemma~\ref{lem:negahban}.
+
 \begin{lemma}
-Let ${\cal C}({\cal M}, \bar {\cal M}^\perp, \theta^*) \defeq \{ \Delta \in \mathbb{R}^p | {\cal R}(\Delta_{\bar {\cal M}^\perp} \leq 3 {\cal R}(\Delta_{\bar {\cal M}} + 4 {\cal R}(\theta^*_{{\cal M}^\perp}) \}$, where $\cal R$ is a \emph{decomposable} regularizer with respect to $({\cal M}, \bar {\cal M}^\perp)$, and $({\cal M}, \bar {\cal M})$ are two subspaces such that ${\cal M} \subseteq \bar {\cal M}$. Suppose that $\exists \kappa_{\cal L} > 0, \; \exists \tau_{\cal L}, \; \forall \Delta \in {\cal C}, \; {\cal L}(\theta^* + \Delta) - {\cal L}(\theta^*) - \langle \Delta {\cal L}(\theta^*), \Delta \rangle \geq \kappa_{\cal L} \|\Delta\|^2 - \tau_{\cal L}^2(\theta^*)$. Let $\Psi({\cal M}) \defeq \sup_{u \in {\cal M} \backslash \{0\}} \frac{{\cal R}(u)}{\|u\|}$. Finally suppose that $\lambda \geq 2 {\cal R}(\nabla {\cal L}(\theta^*))$, where ${\cal R}^*$ is the conjugate of ${\cal R}$. Then: $$\|\hat \theta_\lambda - \theta^* \|^2 \leq 9 \frac{\lambda^2}{\kappa_{\cal L}}\Psi^2(\bar {\cal M}) + \frac{\lambda}{\kappa_{\cal L}}\{2 \tau^2_{\cal L}(\theta^*) + 4 {\cal R}(\theta^*_{{\cal M}^\perp}\}$$
+\label{lem:negahban}
+Let ${\cal C}(S) \defeq \{ \Delta \in \mathbb{R}^m\,|\,\|\Delta_{S^c}\|_1 \leq
+3 \|\Delta_S\|_1 \}$. Suppose that:
+\begin{multline}
+  \label{eq:rc}
+  \forall \Delta \in {\cal C}(S), \;
+  {\cal L}(\theta^* + \Delta) - {\cal L}(\theta^*)\\
+  - \inprod{\nabla {\cal L}(\theta^*)}{\Delta}
+  \geq \kappa_{\cal L} \|\Delta\|_2^2 - \tau_{\cal L}^2(\theta^*)
+\end{multline}
+for some $\kappa_{\cal L} > 0$ and some function $\tau_{\cal L}$. Finally,
+suppose that $\lambda \geq 2 \|\nabla {\cal L}(\theta^*)\|_{\infty}$. Then, if
+$\hat{\theta}_\lambda$ is the solution of \eqref{eq:pre-mle}:
+\begin{displaymath}
+\|\hat \theta_\lambda - \theta^* \|_2^2
+\leq 9 \frac{\lambda^2 s}{\kappa_{\cal L}^2}
++ \frac{2\lambda}{\kappa_{\cal L}} \tau^2_{\cal L}(\theta^*)
+\end{displaymath}
 \end{lemma}
-In the context of the Graph Inference problem, ${\cal M} \defeq \bar {\cal M} \defeq \{i \, : \, \theta_i \neq 0\}$, i.e. the subspace generated by the set of parents.${\cal R}$ is chosen to be $\ell1$-norm, which is decomposable for $({\cal M}, \bar {\cal M}^\perp)$. It follows that $\Psi(\bar {\cal M}) = \sqrt{s}$ and ${\cal R}^* = \|\|_{\infty}$. From \ref{lem:icc_lambda_upper_bound}, we can choose $\lambda \geq 2 \sqrt{\frac{\log m}{n^{1-\delta}p_{\min}}}$. The (RE) assumption ensures the existence of $\kappa_{\cal L} = \gamma > 0$ and $\tau_{\cal L} = 0$. Finally, we observe ${\cal R}(\theta^*_{{\cal M}^\perp)} = 0$ by definition of ${\cal M}^\perp$, yielding Eq.~\ref{eq:sparse}.
-\end{proof}
-The proof is included in the Appendix. Note that as $n$ goes to infinity, the hessian $\nabla^2 {\cal L}(\theta)$ goes to the \emph{Fisher Information Matrix} of the cascade model as a function of $\theta$. The various assumptions of theorem~\ref{thm:neghaban} are discussed in section~\ref{sec:assumptions}. The following corollary follows easily and gives the first ${\cal O}(s \log m)$ algorithm for graph reconstruction on general graphs.
+
+To prove Theorem~\ref{thm:main}, we apply Lemma~\ref{lem:negahban} with
+$\tau_{\mathcal{L}}=0$. Since $\mathcal{L}$ is twice differentiable and convex,
+a second-order Taylor expansion of $\mathcal{L}$ around $\theta^*$ shows that
+assumption \eqref{eq:rc} is implied by the restricted eigenvalue condition
+\eqref{eq:re}. The upper bound on the $\ell_{\infty}$ norm of
+$\nabla\mathcal{L}(\theta^*)$ is given by Lemma~\ref{lem:ub}.
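
To get a feel for the rate in \eqref{eq:sparse}, one can evaluate the bound
numerically; the values of $(m, s, \gamma, \alpha, \delta)$ below are
hypothetical and chosen only for illustration.

# Back-of-the-envelope evaluation of the error bound of the main theorem;
# every numeric value here is hypothetical.
import math

m, s = 10**4, 10                     # candidate parents, sparsity
gamma, alpha, delta = 0.5, 0.1, 0.1  # RE constant, LF constant, slack
for n in (10**4, 10**5, 10**6):      # number of observations
    rate = s * math.log(m) / (alpha * n ** (1 - delta))
    lam = 2 * math.sqrt(math.log(m) / (alpha * n ** (1 - delta)))
    print(f"n = {n:>7}: lambda = {lam:.4f}, error bound = {3 / gamma * math.sqrt(rate):.4f}")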
+
+\begin{lemma}
+\label{lem:ub}
+Assume {\bf(LF)} holds for some $\alpha>0$. For any $\delta\in(0,1)$:
+\begin{displaymath}
+  \|\nabla {\cal L}(\theta^*)\|_{\infty}
+  \leq 2 \sqrt{\frac{\log m}{\alpha n^{1 - \delta}}}
+  \quad
+  \text{w.p.}\; 1-e^{-n^\delta \log m}
+\end{displaymath}
+\end{lemma}
+
+\begin{proof}
+The gradient of $\mathcal{L}$ is given by:
+\begin{multline*}
+  \nabla \mathcal{L}(\theta^*) =
+  \frac{1}{|\mathcal{T}|}\sum_{t\in \mathcal{T}}x^t\bigg[
+  x_i^{t+1}\frac{f'}{f}(\inprod{\theta^*}{x^t})\\
+  - (1-x_i^{t+1})\frac{f'}{1-f}(\inprod{\theta^*}{x^t})\bigg]
+\end{multline*}
+
+Let $\partial_j \mathcal{L}(\theta^*)$ be the $j$-th coordinate of
+$\nabla\mathcal{L}(\theta^*)$ and write $\partial_j\mathcal{L}(\theta^*)
+= \frac{1}{|\mathcal{T}|}\sum_{t\in\mathcal{T}} Y_t$. Since
+$\E[x_i^{t+1}|x^t]= f(\inprod{\theta^*}{x^t})$, we have
+$\E[Y_{t+1}\,|\,Y_1,\ldots,Y_t] = 0$. Hence $Z_t = \sum_{k=1}^t Y_k$ is
+a martingale.
+
+Using assumption (LF), we have almost surely $|Z_{t+1}-Z_t|\leq \frac{1}{\alpha}$
+and we can apply Azuma's inequality to $Z_t$:
+\begin{displaymath}
+  \P\big[|Z_{\mathcal{T}}|\geq \lambda\big]\leq
+  2\exp\left(\frac{-\lambda^2\alpha}{2n}\right)
+\end{displaymath}
+
+Since $\partial_j\mathcal{L}(\theta^*) = Z_{|\mathcal{T}|}/n$, applying the
+previous inequality at level $n\lambda$ together with a union bound over the
+$m$ coordinates of $\nabla\mathcal{L}(\theta^*)$ implies:
+\begin{align*}
+  \P\big[\|\nabla\mathcal{L}(\theta^*)\|_{\infty}\geq \lambda \big]
+  &\leq 2m\exp\left(\frac{-\lambda^2n\alpha}{2}\right)
+\end{align*}
+Choosing $\lambda\defeq 2\sqrt{\frac{\log m}{\alpha n^{1-\delta}}}$ concludes the
+proof.
+\end{proof}
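
As a sanity check on Lemma~\ref{lem:ub}, the concentration can be simulated
directly. The sketch below uses a logistic link, for which (LF) holds with
$\alpha = 1$ (since $f' = f(1-f)$ gives $|f'/f| = 1-f \leq 1$ and
$|f'/(1-f)| = f \leq 1$); this instantiation and all constants are ours, not
the paper's.

# Simulation of the l_infty gradient bound for a logistic link (alpha = 1).
import numpy as np

rng = np.random.default_rng(0)
n, m, delta = 2000, 100, 0.1
theta_star = np.zeros(m)
theta_star[:5] = 1.0

sup_norms = []
for _ in range(200):
    X = rng.binomial(1, 0.3, (n, m)).astype(float)
    p = 1.0 / (1.0 + np.exp(-(X @ theta_star)))
    y = rng.binomial(1, p)
    # gradient of L at theta*: (1/n) * sum_t x^t (x_i^{t+1} - f(<theta*, x^t>))
    grad = X.T @ (y - p) / n
    sup_norms.append(np.abs(grad).max())

bound = 2 * np.sqrt(np.log(m) / n ** (1 - delta))  # alpha = 1
print(f"max over runs: {max(sup_norms):.4f}  vs  bound: {bound:.4f}")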
 
 \begin{corollary}
 \label{cor:variable_selection}
@@ -52,7 +164,9 @@
 Assume that ${\bf (RE)}$ holds with $\gamma > 0$ and that $\theta$ is s-sparse.
 \begin{equation}
 n > \frac{36}{p_{\min}\gamma^2 \epsilon^2} s \log m
 \end{equation}
-Then with probability $1-\frac{1}{m}$, ${\cal S}^*_{\eta + \epsilon} \subset \hat {\cal S}_\eta \subset {\cal S}^*$. In other words we recover all `strong' parents and no `false' parents.
+Then with probability $1-\frac{1}{m}$, ${\cal S}^*_{\eta + \epsilon} \subset
+\hat {\cal S}_\eta \subset {\cal S}^*$. In other words, we recover all `strong'
+parents and no `false' parents.
 \end{corollary}
 
 \begin{proof}
@@ -105,3 +219,9 @@
 Using the inequality $\forall x>0, \; \log x \geq 1 - \frac{1}{x}$, we have $|\l
 \end{proof}
 
 The results of sections~\ref{sec:main_theorem} and \ref{sec:relaxing_sparsity}
 therefore hold for the original transition probabilities $p$.
+
+\begin{comment}
+\begin{lemma}
+Let ${\cal C}({\cal M}, \bar {\cal M}^\perp, \theta^*) \defeq \{ \Delta \in \mathbb{R}^p | {\cal R}(\Delta_{\bar {\cal M}^\perp}) \leq 3 {\cal R}(\Delta_{\bar {\cal M}}) + 4 {\cal R}(\theta^*_{{\cal M}^\perp}) \}$, where $\cal R$ is a \emph{decomposable} regularizer with respect to $({\cal M}, \bar {\cal M}^\perp)$, and $({\cal M}, \bar {\cal M})$ are two subspaces such that ${\cal M} \subseteq \bar {\cal M}$. Suppose that $\exists \kappa_{\cal L} > 0, \; \exists \tau_{\cal L}, \; \forall \Delta \in {\cal C}, \; {\cal L}(\theta^* + \Delta) - {\cal L}(\theta^*) - \langle \nabla {\cal L}(\theta^*), \Delta \rangle \geq \kappa_{\cal L} \|\Delta\|^2 - \tau_{\cal L}^2(\theta^*)$. Let $\Psi({\cal M}) \defeq \sup_{u \in {\cal M} \backslash \{0\}} \frac{{\cal R}(u)}{\|u\|}$. Finally suppose that $\lambda \geq 2 {\cal R}^*(\nabla {\cal L}(\theta^*))$, where ${\cal R}^*$ is the conjugate of ${\cal R}$.
+Then: $$\|\hat \theta_\lambda - \theta^* \|^2 \leq 9 \frac{\lambda^2}{\kappa_{\cal L}}\Psi^2(\bar {\cal M}) + \frac{\lambda}{\kappa_{\cal L}}\{2 \tau^2_{\cal L}(\theta^*) + 4 {\cal R}(\theta^*_{{\cal M}^\perp})\}$$
+\end{lemma}
+\end{comment}
