Diffstat (limited to 'paper/sections/results.tex')
-rw-r--r--  paper/sections/results.tex  172
1 file changed, 146 insertions, 26 deletions
diff --git a/paper/sections/results.tex b/paper/sections/results.tex
index 31e478c..216ad65 100644
--- a/paper/sections/results.tex
+++ b/paper/sections/results.tex
@@ -1,50 +1,162 @@
-In this section, we exploit standard techniques in sparse recovery and leverage the simple nature of generalized linear cascades to address the standard problem of edge detection. We extend prior work by showing that edge weights of the graph can be recovered under similar assumptions. We extend these results by considering the non-exactly sparse case.
+In this section, we apply the sparse recovery framework to analyze under which
+assumptions our program~\eqref{eq:pre-mle} recovers the true parameter
+$\theta_i$ of the cascade model. Furthermore, if we can estimate $\theta_i$
+with sufficient accuracy, it is then possible to recover the support of
+$\theta_i$ by simple thresholding, which provides a solution to the standard
+Graph Inference problem.
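+Concretely, if $\hat\theta$ denotes the solution of \eqref{eq:pre-mle}, one
+can return, for a threshold $\eta>0$, the estimated support
+\begin{displaymath}
+  \hat{\mathcal{S}}_\eta \defeq \{j\,:\,|\hat\theta_j| > \eta\},
+\end{displaymath}
+which is, up to the choice of $\eta$, the estimator analyzed in
+Corollary~\ref{cor:variable_selection} below.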
-As mentioned previously, our objective is twofold:
+We will first give results in the exactly sparse setting in which $\theta_i$
+has a support of size exactly $s$. We will then relax this sparsity constraint
+and give results in the \emph{stable recovery} setting where $\theta_i$ is
+approximately $s$-sparse.
+
+As mentioned in Section~\ref{sec:mle}, the maximum likelihood estimation
+program is decomposable. We will henceforth focus on a single node $i\in V$ and
+omit the subscript $i$ in the notation when there is no ambiguity. The
+recovery problem is now that of estimating a single vector $\theta^*$ from
+a set $\mathcal{T}$ of observations. We will write $n\defeq |\mathcal{T}|$.
+\begin{comment}
+As mentioned previously, our objective is twofold:
\begin{enumerate}
-\item We seek to recover the edges of the graph. In other words, for each node $j$, we wish to identify the set of indices $\{i: \Theta_{ij} \neq 0\}$ .
-\item We seek to recover the edge weights $\Theta$ of ${\cal G}$ i.e. upper-bound the error $\|\hat \Theta - \Theta \|_2$.
+  \item We seek to recover the edges of the graph. In other words, for each
+    node $j$, we wish to identify the set of indices $\{i: \Theta_{ij} \neq
+    0\}$.
+  \item We seek to recover the edge weights $\Theta$ of ${\cal G}$, i.e.\
+    upper-bound the error $\|\hat \Theta - \Theta \|_2$.
\end{enumerate}
-It is easy to see that solving the second objective allows us to solve the first. If $\|\hat \Theta - \Theta \|_2$ is sufficiently small, we can recover all large coordinates of $\Theta$ by keeping only the coordinates above a certain threshold.
+It is easy to see that solving the second objective allows us to solve the
+first. If $\|\hat \Theta - \Theta \|_2$ is sufficiently small, we can recover
+all large coordinates of $\Theta$ by keeping only the coordinates above
+a certain threshold.
+\end{comment}
\subsection{Main Theorem}
\label{sec:main_theorem}
-For the remainder of this section, we consider a single node $i$. For ease of presentation, the index $j$ is made implicit: $\theta_i \leftarrow \Theta_{ij}$ and $\theta \leftarrow (\Theta_{i\cdot})$. We begin our analysis with the following lemma:
-\begin{lemma}
-\label{lem:icc_lambda_upper_bound}
-For any $\delta > 0$, with probability $1-e^{-n^\delta \log m}$, $\|\nabla {\cal L}(\theta^*)\|_{\infty} \leq 2 \sqrt{\frac{\log m}{n^{1 - \delta} p_{\min}}}$
-\end{lemma}
-
-Consider the well-known \emph{restricted eigenvalue condition} for a symetric matrix $\Sigma$ and set ${\cal C} := \{X \in \mathbb{R}^p : \|X_{S^c}\|_1 \leq 3 \|X_S\|_1 \} \cap \{ \|X\|_1 \leq 1 \}$
+In this subsection, we analyze the case where $\theta^*$ is exactly sparse.
+We write $S\defeq\text{supp}(\theta^*)$ and $s\defeq|S|$. Our main theorem
+relies on the now-standard \emph{restricted eigenvalue condition}.
+\begin{definition}
+  Let $\Sigma\in\mathcal{S}_m(\reals)$ be a real symmetric matrix and let
+  $S$ be a subset of $\{1,\ldots,m\}$. Define $\mathcal{C}(S)\defeq
+  \{X\in\reals^m\,:\,\|X\|_1\leq 1\text{ and } \|X_{S^c}\|_1\leq
+  3\|X_S\|_1\}$. We say that $\Sigma$ satisfies the
+  $(S,\gamma)$-\emph{restricted eigenvalue condition} if:
\begin{equation}
-\nonumber
-\forall X \in {\cal C}, \| \Sigma X \|_2^2 \geq \gamma_n \|X\|_2^2
+  \forall X \in \mathcal{C}(S), \; X^\top \Sigma X \geq \gamma \|X\|_2^2
\tag{RE}
+\label{eq:re}
\end{equation}
+\end{definition}
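+Note that when $\Sigma$ is positive definite, \eqref{eq:re} holds for every
+$S$ with $\gamma = \lambda_{\min}(\Sigma)$. The interest of the restricted
+condition is that it may continue to hold on the cone $\mathcal{C}(S)$ even
+when $\Sigma$ is singular, as is necessarily the case for the Hessian
+$\nabla^2\mathcal{L}(\theta^*)$ when $n < m$, since it is then a sum of at
+most $n$ rank-one terms.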
-The following theorem is a consequence of Theorem 1 in \cite{Negahban:2009} and Lemma~\ref{lem:icc_lambda_upper_bound}.
+A discussion of this assumption in the context of generalized linear cascade
+models can be found in Section~\ref{sec:assumptions}.
+
+We will also need the following assumption on the inverse link function $f$ of
+the generalized linear cascade model:
+\begin{equation}
+ \tag{LF}
+ \max\left(\left|\frac{f'}{f}(\inprod{\theta^*}{x})\right|,
+ \left|\frac{f'}{1-f}(\inprod{\theta^*}{x})\right|\right)\leq
+ \frac{1}{\alpha}
+\end{equation}
+for all $x\in\reals^m$ such that $f(\inprod{\theta^*}{x})\notin\{0,1\}$.
+
+It is easy to verify that {\bf(LF)} holds (with $\alpha$ depending on
+$\delta'$) if $\delta'\leq p_{i,j}\leq 1-\delta'$ for all $(i,j)\in E$ in
+the Voter model, and if $p_{i,j}\geq \delta'$ for all $(i,j)\in E$ in the
+Independent Cascade model, for some constant $\delta'>0$.
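+For instance, assuming the standard parametrization of the Independent
+Cascade model as a generalized linear cascade, $f(z) = 1-e^z$ with
+$\theta^*_j = \log(1-p_{i,j})$, one has $f' = -e^z$, so that for every $x$
+with $f(\inprod{\theta^*}{x})\notin\{0,1\}$:
+\begin{displaymath}
+  \left|\frac{f'}{1-f}(\inprod{\theta^*}{x})\right| = 1
+  \quad\text{and}\quad
+  \left|\frac{f'}{f}(\inprod{\theta^*}{x})\right|
+  = \frac{1-f(\inprod{\theta^*}{x})}{f(\inprod{\theta^*}{x})}
+  \leq \frac{1}{\delta'},
+\end{displaymath}
+where the last inequality uses that at least one parent of $i$ is active,
+hence $f(\inprod{\theta^*}{x})\geq \delta'$. Thus {\bf(LF)} holds with
+$\alpha = \delta'$.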
\begin{theorem}
-\label{thm:neghaban}
-Suppose the true vector $\theta^*$ has support S of size s and the {\bf(RE)} assumption holds for the Hessian $\nabla^2 {\cal L}(\theta)$ with $\gamma > 0$, then for $\delta > 0$ by solving \eqref{eq:pre-mle} with $\lambda \defeq 2\sqrt{\frac{\log m}{n^{1 - \delta} p_{\min}}}$, we have with probability $1-\exp(-n^\delta \log m)$:
+\label{thm:main}
+Assume the Hessian $\nabla^2\mathcal{L}(\theta^*)$ satisfies the
+$(S,\gamma)$-restricted eigenvalue condition~\eqref{eq:re} for some $\gamma
+> 0$ and that {\bf(LF)} holds for some $\alpha > 0$. For any
+$\delta\in(0,1)$, let $\hat{\theta}$ be the solution of \eqref{eq:pre-mle}
+with $\lambda \defeq 2\sqrt{\frac{\log m}{\alpha^2 n^{1-\delta}}}$. Then:
\begin{equation}
-\label{eq:sparse}
-\|\hat \theta - \theta \|_2 \leq \frac{3}{\gamma} \sqrt{\frac{s \log m}{p_{\min} n^{1-\delta}}}
+  \label{eq:sparse}
+  \|\hat \theta - \theta^* \|_2
+  \leq \frac{6}{\gamma} \sqrt{\frac{s \log m}{\alpha^2 n^{1-\delta}}}
+  \quad
+  \text{w.p.}\; 1-e^{-n^\delta \log m}
\end{equation}
\end{theorem}
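+In particular, for a target accuracy $\epsilon > 0$, rearranging
+\eqref{eq:sparse} shows that
+\begin{displaymath}
+  n^{1-\delta} \geq \frac{36}{\gamma^2\alpha^2\epsilon^2}\, s\log m
+\end{displaymath}
+observations suffice to guarantee $\|\hat\theta - \theta^*\|_2\leq\epsilon$
+with high probability; up to the change of notation from $p_{\min}$ to
+$\alpha$, this is the sample complexity appearing in
+Corollary~\ref{cor:variable_selection}.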
-\begin{proof}
-Adopting the notation of \cite{Negahban:2009}:
+This theorem is a consequence of Theorem~1 in \cite{Negahban:2009}, which
+gives a bound on the convergence rate of regularized estimators. We state
+their theorem in the context of $\ell_1$-regularization in
+Lemma~\ref{lem:negahban}.
+
\begin{lemma}
-Let ${\cal C}({\cal M}, \bar {\cal M}^\perp, \theta^*) \defeq \{ \Delta \in \mathbb{R}^p | {\cal R}(\Delta_{\bar {\cal M}^\perp} \leq 3 {\cal R}(\Delta_{\bar {\cal M}} + 4 {\cal R}(\theta^*_{{\cal M}^\perp}) \}$, where $\cal R$ is a \emph{decomposable} regularizer with respect to $({\cal M}, \bar {\cal M}^\perp)$, and $({\cal M}, \bar {\cal M})$ are two subspaces such that ${\cal M} \subseteq \bar {\cal M}$. Suppose that $\exists \kappa_{\cal L} > 0, \; \exists \tau_{\cal L}, \; \forall \Delta \in {\cal C}, \; {\cal L}(\theta^* + \Delta) - {\cal L}(\theta^*) - \langle \Delta {\cal L}(\theta^*), \Delta \rangle \geq \kappa_{\cal L} \|\Delta\|^2 - \tau_{\cal L}^2(\theta^*)$. Let $\Psi({\cal M}) \defeq \sup_{u \in {\cal M} \backslash \{0\}} \frac{{\cal R}(u)}{\|u\|}$. Finally suppose that $\lambda \geq 2 {\cal R}(\nabla {\cal L}(\theta^*))$, where ${\cal R}^*$ is the conjugate of ${\cal R}$. Then: $$\|\hat \theta_\lambda - \theta^* \|^2 \leq 9 \frac{\lambda^2}{\kappa_{\cal L}}\Psi^2(\bar {\cal M}) + \frac{\lambda}{\kappa_{\cal L}}\{2 \tau^2_{\cal L}(\theta^*) + 4 {\cal R}(\theta^*_{{\cal M}^\perp}\}$$
+ \label{lem:negahban}
+Let ${\cal C}(S) \defeq \{ \Delta \in \mathbb{R}^m\,|\,\|\Delta_{S^c}\|_1 \leq
+3 \|\Delta_S\|_1 \}$. Suppose that:
+\begin{multline}
+ \label{eq:rc}
+ \forall \Delta \in {\cal C}(S), \;
+ {\cal L}(\theta^* + \Delta) - {\cal L}(\theta^*)\\
+  - \inprod{\nabla {\cal L}(\theta^*)}{\Delta}
+ \geq \kappa_{\cal L} \|\Delta\|_2^2 - \tau_{\cal L}^2(\theta^*)
+\end{multline}
+for some $\kappa_{\cal L} > 0$ and some function $\tau_{\cal L}$. Suppose
+furthermore that $\lambda \geq 2 \|\nabla {\cal L}(\theta^*)\|_{\infty}$.
+Then, if $\hat{\theta}_\lambda$ is the solution of \eqref{eq:pre-mle}:
+\begin{displaymath}
+\|\hat \theta_\lambda - \theta^* \|_2^2
+\leq 9 \frac{\lambda^2 s}{\kappa_{\cal L}^2}
++ \frac{2\lambda}{\kappa_{\cal L}} \tau^2_{\cal L}(\theta^*)
+\end{displaymath}
\end{lemma}
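+In the terminology of \cite{Negahban:2009}, Lemma~\ref{lem:negahban} is
+their Theorem~1 specialized to the $\ell_1$-norm, which is decomposable; the
+factor $s$ arises from the subspace compatibility constant
+$\Psi(\bar{\mathcal{M}}) = \sqrt{s}$ when $\bar{\mathcal{M}}$ is the
+subspace of vectors supported on $S$.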
-In the context of the Graph Inference problem, ${\cal M} \defeq \bar {\cal M} \defeq \{i \, : \, \theta_i \neq 0\}$, i.e. the subspace generated by the set of parents.${\cal R}$ is chosen to be $\ell1$-norm, which is decomposable for $({\cal M}, \bar {\cal M}^\perp)$. It follows that $\Psi(\bar {\cal M}) = \sqrt{s}$ and ${\cal R}^* = \|\|_{\infty}$. From \ref{lem:icc_lambda_upper_bound}, we can choose $\lambda \geq 2 \sqrt{\frac{\log m}{n^{1-\delta}p_{\min}}}$. The (RE) assumption ensures the existence of $\kappa_{\cal L} = \gamma > 0$ and $\tau_{\cal L} = 0$. Finally, we observe ${\cal R}(\theta^*_{{\cal M}^\perp)} = 0$ by definition of ${\cal M}^\perp$, yielding Eq.~\ref{eq:sparse}.
-\end{proof}
-The proof is included in the Appendix. Note that as $n$ goes to infinity, the hessian $\nabla^2 {\cal L}(\theta)$ goes to the \emph{Fisher Information Matrix} of the cascade model as a function of $\theta$. The various assumptions of theorem~\ref{thm:neghaban} are discussed in section~\ref{sec:assumptions}. The following corollary follows easily and gives the first ${\cal O}(s \log m)$ algorithm for graph reconstruction on general graphs.
+To prove Theorem~\ref{thm:main}, we apply Lemma~\ref{lem:negahban} with
+$\tau_{\mathcal{L}}=0$ and $\kappa_{\mathcal{L}}=\gamma$. Since $\mathcal{L}$
+is twice differentiable and convex, assumption \eqref{eq:rc} is implied by
+the restricted eigenvalue condition \eqref{eq:re}. The required upper bound
+on the $\ell_{\infty}$-norm of $\nabla\mathcal{L}(\theta^*)$ is given by
+Lemma~\ref{lem:ub}.
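+Explicitly, with $\kappa_{\mathcal{L}}=\gamma$, $\tau_{\mathcal{L}}=0$ and
+$\lambda = 2\sqrt{\frac{\log m}{\alpha^2 n^{1-\delta}}}$,
+Lemma~\ref{lem:negahban} gives
+\begin{displaymath}
+  \|\hat\theta - \theta^*\|_2^2
+  \leq \frac{9\lambda^2 s}{\gamma^2}
+  = \frac{36\, s\log m}{\gamma^2\alpha^2 n^{1-\delta}},
+\end{displaymath}
+and taking square roots yields \eqref{eq:sparse}.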
+
+\begin{lemma}
+\label{lem:ub}
+Assume {\bf(LF)} holds for some $\alpha>0$. For any $\delta\in(0,1)$:
+\begin{displaymath}
+  \|\nabla {\cal L}(\theta^*)\|_{\infty}
+  \leq 2 \sqrt{\frac{\log m}{\alpha^2 n^{1 - \delta}}}
+  \quad
+  \text{w.p.}\; 1-e^{-n^\delta \log m}
+\end{displaymath}
+\end{lemma}
+
+\begin{proof}
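+Recall that $\mathcal{L}$ is, up to its sign, the normalized log-likelihood
+of the observations:
+\begin{displaymath}
+  \theta\mapsto\frac{1}{|\mathcal{T}|}\sum_{t\in\mathcal{T}}
+  \Big[x_i^{t+1}\log f(\inprod{\theta}{x^t})
+  + (1-x_i^{t+1})\log\big(1-f(\inprod{\theta}{x^t})\big)\Big].
+\end{displaymath}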
+The gradient of $\mathcal{L}$ is given by:
+\begin{multline*}
+ \nabla \mathcal{L}(\theta^*) =
+ \frac{1}{|\mathcal{T}|}\sum_{t\in \mathcal{T}}x^t\bigg[
+ x_i^{t+1}\frac{f'}{f}(\inprod{\theta^*}{x^t})\\
+ - (1-x_i^{t+1})\frac{f'}{1-f}(\inprod{\theta^*}{x^t})\bigg]
+\end{multline*}
+
+Let $\partial_j \mathcal{L}(\theta^*)$ denote the $j$-th coordinate of
+$\nabla\mathcal{L}(\theta^*)$ and write $\partial_j\mathcal{L}(\theta^*)
+= \frac{1}{|\mathcal{T}|}\sum_{t\in\mathcal{T}} Y_t$, where $Y_t$ denotes
+the $j$-th coordinate of the $t$-th summand above. Since
+$\E[x_i^{t+1}|x^t]= f(\inprod{\theta^*}{x^t})$, we have $\E[Y_t|x^t] = 0$.
+Hence $Z_t \defeq \sum_{k=1}^t Y_k$ is a martingale.
+
+Using assumption {\bf(LF)} and the fact that $|x_j^t|\leq 1$, the increments
+satisfy $|Z_{t+1}-Z_t|\leq \frac{1}{\alpha}$ almost surely, so we can apply
+Azuma's inequality to $Z_n$ (recall $n=|\mathcal{T}|$) with deviation
+$n\lambda$:
+\begin{displaymath}
+  \P\big[|Z_n|\geq n\lambda\big]\leq
+  2\exp\left(\frac{-n\lambda^2\alpha^2}{2}\right)
+\end{displaymath}
+
+Since $\partial_j\mathcal{L}(\theta^*) = Z_n/n$, applying a union bound over
+the $m$ coordinates of $\nabla\mathcal{L}(\theta^*)$ implies:
+\begin{align*}
+  \P\big[\|\nabla\mathcal{L}(\theta^*)\|_{\infty}\geq \lambda \big]
+  &\leq 2m\exp\left(\frac{-n\lambda^2\alpha^2}{2}\right)
+\end{align*}
+Choosing $\lambda\defeq 2\sqrt{\frac{\log m}{\alpha^2 n^{1-\delta}}}$ makes
+the exponent equal to $-2n^\delta\log m$, so that the probability above is
+at most $2m\,e^{-2n^\delta\log m}\leq e^{-n^\delta\log m}$ as soon as
+$n^\delta\log m\geq\log(2m)$, which concludes the proof.
+\end{proof}
\begin{corollary}
\label{cor:variable_selection}
@@ -52,7 +164,9 @@ Assume that ${\bf (RE)}$ holds with $\gamma > 0$ and that $\theta$ is s-sparse.
\begin{equation}
n > \frac{36}{p_{\min}\gamma^2 \epsilon^2} s \log m
\end{equation}
-Then with probability $1-\frac{1}{m}$, ${\cal S}^*_{\eta + \epsilon} \subset \hat {\cal S}_\eta \subset {\cal S}^*$. In other words we recover all `strong' parents and no `false' parents.
+Then with probability $1-\frac{1}{m}$, ${\cal S}^*_{\eta + \epsilon} \subset
+\hat {\cal S}_\eta \subset {\cal S}^*$. In other words we recover all `strong'
+parents and no `false' parents.
\end{corollary}
\begin{proof}
@@ -105,3 +219,9 @@ Using the inequality $\forall x>0, \; \log x \geq 1 - \frac{1}{x}$, we have $|\l
\end{proof}
The results of sections~\ref{sec:main_theorem} and \ref{sec:relaxing_sparsity} therefore hold for the original transition probabilities $p$.
+
+\begin{comment}
+\begin{lemma}
+Let ${\cal C}({\cal M}, \bar {\cal M}^\perp, \theta^*) \defeq \{ \Delta \in \mathbb{R}^p \,|\, {\cal R}(\Delta_{\bar {\cal M}^\perp}) \leq 3 {\cal R}(\Delta_{\bar {\cal M}}) + 4 {\cal R}(\theta^*_{{\cal M}^\perp}) \}$, where $\cal R$ is a \emph{decomposable} regularizer with respect to $({\cal M}, \bar {\cal M}^\perp)$, and $({\cal M}, \bar {\cal M})$ are two subspaces such that ${\cal M} \subseteq \bar {\cal M}$. Suppose that $\exists \kappa_{\cal L} > 0, \; \exists \tau_{\cal L}, \; \forall \Delta \in {\cal C}, \; {\cal L}(\theta^* + \Delta) - {\cal L}(\theta^*) - \langle \nabla {\cal L}(\theta^*), \Delta \rangle \geq \kappa_{\cal L} \|\Delta\|^2 - \tau_{\cal L}^2(\theta^*)$. Let $\Psi({\cal M}) \defeq \sup_{u \in \bar{\cal M} \backslash \{0\}} \frac{{\cal R}(u)}{\|u\|}$. Finally suppose that $\lambda \geq 2 {\cal R}^*(\nabla {\cal L}(\theta^*))$, where ${\cal R}^*$ is the dual norm of ${\cal R}$. Then: $$\|\hat \theta_\lambda - \theta^* \|^2 \leq 9 \frac{\lambda^2}{\kappa_{\cal L}^2}\Psi^2(\bar {\cal M}) + \frac{\lambda}{\kappa_{\cal L}}\left(2 \tau^2_{\cal L}(\theta^*) + 4 {\cal R}(\theta^*_{{\cal M}^\perp})\right)$$
+\end{lemma}
+\end{comment}