Diffstat (limited to 'paper')
-rw-r--r--  paper/sections/lowerbound.tex  92
-rw-r--r--  paper/sections/results.tex     11
2 files changed, 57 insertions, 46 deletions
diff --git a/paper/sections/lowerbound.tex b/paper/sections/lowerbound.tex
index b714766..7f51f46 100644
--- a/paper/sections/lowerbound.tex
+++ b/paper/sections/lowerbound.tex
@@ -1,63 +1,61 @@
-In this section, we will consider the stable sparse recovery setting of
-Section~\ref{sec:relaxing_sparsity}. Our goal is to obtain an information-theoretic
-lower bound on the number of measurements necessary to approximately recover
-the parameter $\theta$ of a cascade model from observed cascades. Similar lower
-bounds were obtained for sparse linear inverse problems in \cite{pw11, pw12,
-bipw11}
+In \cite{Netrapalli:2012}, the authors establish a lower bound of
+$\Omega(s\log\frac{m}{s})$ on the number of cascades necessary to achieve good
+support recovery with constant probability under a \emph{correlation decay}
+assumption. In this section, we consider the stable sparse recovery
+setting of Section~\ref{sec:relaxing_sparsity}. Our goal is to obtain an
+information-theoretic lower bound on the number of measurements necessary to
+approximately recover the parameter $\theta$ of a cascade model from observed
+cascades. Similar lower bounds were obtained for sparse \emph{linear} inverse
+problems in \cite{pw11, pw12, bipw11}.
 \begin{theorem}
   \label{thm:lb}
   Let us consider a cascade model of the form \eqref{eq:glm} and a recovery
   algorithm $\mathcal{A}$ which takes as input $n$ random cascade measurements and
-  outputs $\hat{\theta}$ such that with probability $\delta<\frac{1}{2}$
+  outputs $\hat{\theta}$ such that with probability $\delta>\frac{1}{2}$
   (over the measurements):
-  \begin{displaymath}
+  \begin{equation}
+    \label{eq:lb}
     \|\hat{\theta}-\theta^*\|_2\leq C\min_{\|\theta\|_0\leq s}\|\theta-\theta^*\|_2
-  \end{displaymath}
-  where $\theta^*$ is the true paramter of the cascade model. Then $n
+  \end{equation}
+  where $\theta^*$ is the true parameter of the cascade model. Then $n
   = \Omega(s\log\frac{m}{s}/\log C)$.
 \end{theorem}
 This theorem should be contrasted with Theorem~\ref{thm:approx_sparse}: up to an
 additive $s\log s$ factor, the number of measurements required by our algorithm is
-tight.
+tight. The proof of Theorem~\ref{thm:lb} follows an approach similar to \cite{pw12}.
+We only present a sketch of the proof here and refer the reader to their paper
+for more details.
 
-The proof of Theorem~\ref{thm:lb} follows an approach similar to \cite{pw12}.
-Let us consider an algorithm $\mathcal{A}$ as in the theorem. Intuitively,
-$\mathcal{A}$ allows Alice and Bob to send $\Omega(s\log\frac{m}{s})$ quantity
-of information over a Gaussian channel. By the Shannon-Hartley theorem, this
-quantity of information is $O(n\log C)$. These two bounds together give the
-theorem.
+Let us consider an algorithm $\mathcal{A}$ which satisfies the recovery
+guarantee of Theorem~\ref{thm:lb}: there exists a probability distribution over
+measurements such that for all vectors $\theta^*$, \eqref{eq:lb} holds w.p.
+$\delta$. By the probabilistic method, this implies that for every distribution
+$D$ over vectors $\theta$, there exists an $n\times m$ measurement matrix $X_D$
+such that \eqref{eq:lb} holds w.p. $\delta$ ($\theta$ is now the random
+variable).
 
-Formally, let $\mathcal{F}\subset \{S\subset [1..m]\,|\, |S|=s\}$ be a family
-of $s$-sparse supports such that:
-\begin{itemize}
-    \item $|S\Delta S'|\geq s$ for $S\neq S'\in\mathcal{F}$.
-    \item $\P_{S\in\mathcal{F}}[i\in S]=\frac{s}{m}$ for all $i\in [1..m]$ and
-        when $S$ is chosen uniformly at random in $\mathcal{F}$
-    \item $\log|\mathcal{F}| = \Omega(s\log\frac{m}{s})$
-\end{itemize}
-such a family can be obtained by considering a linear code (see details in
-\cite{pw11}. Let $X = \big\{t\in\{-1,0,1\}^m\,|\, \mathrm{supp}(t)\in\mathcal{F}\big\}$.
+Consider the following distribution $D$: choose $S$
+uniformly at random from a ``well-chosen'' set of $s$-sparse supports
+$\mathcal{F}$ and $t$ uniformly at random from $X
+\defeq\big\{t\in\{-1,0,1\}^m\,|\, \mathrm{supp}(t)\in\mathcal{F}\big\}$. Define
+$\theta = t + w$ where $w\sim\mathcal{N}(0, \alpha\frac{s}{m}I_m)$ and $\alpha
+= \Omega(\frac{1}{C})$.
 
-Consider the following communication game between Alice and Bob.
+Consider the following communication game between Alice and Bob:
 \begin{itemize}
-    \item Alice chooses $S$ uniformly at random from $\mathcal{F}$ and
-        $t$ uniformly at random from $X$ subject to $\mathrm{supp}(x) = S$
-    \item Let $w\sim\mathcal{N}(0, \alpha \frac{s}{m}I_m)$ and $\theta
-        = t + w$. Since for all $\theta$, $\mathcal{A}$ recovers $\theta$ with
-        probability $1-\delta$ over the measurements, there exists a fixed set
-        of measurements $x_1,\ldots, x_n$ such that with probability
-        $1-\delta$ over $\theta$, $\mathcal{A}$ recovers $\theta$. Alice sends
-        those measurements to Bob.
-    \item Bob uses $\mathcal{A}$ to recover $\hat{\theta}$ from the
-        measurements, then computes $\hat{t}$ the best $\ell_2$ approximation
-        of $\hat{\theta}$ in $X$.
+    \item Alice draws $y\in\{0,1\}^n$ from a Bernoulli distribution with
+        parameter $f(X_D\theta)$ and sends it to Bob.
+    \item Bob uses $\mathcal{A}$ to recover $\hat{\theta}$ from $y$.
 \end{itemize}
+It can be shown that at the end of the game Bob holds
+$\Omega(s\log \frac{m}{s})$ bits of information about $S$. By the
+Shannon-Hartley theorem, this information is also upper-bounded by
+$O(n\log C)$. These two bounds together imply the theorem.
 
-We have the following from \cite{pw11} and \cite{pw12} respectively.
-
+\begin{comment}
 \begin{lemma}
   \label{lemma:upperinf}
   Let $S'=\mathrm{supp}(\hat{t})$. If $\alpha = \Omega(\frac{1}{C})$, $I(S, S')
@@ -71,3 +69,15 @@ We have the following from \cite{pw11} and \cite{pw12} respectively.
 
 Lemmas \ref{lemma:lowerinf} and \ref{lemma:upperinf} together give Theorem
 \ref{thm:lb}.
+
+Formally, let $\mathcal{F}\subset \{S\subset [1..m]\,|\, |S|=s\}$ be a family
+of $s$-sparse supports such that:
+\begin{itemize}
+    \item $|S\Delta S'|\geq s$ for $S\neq S'\in\mathcal{F}$.
+    \item $\P_{S\in\mathcal{F}}[i\in S]=\frac{s}{m}$ for all $i\in [1..m]$,
+        when $S$ is chosen uniformly at random in $\mathcal{F}$.
+    \item $\log|\mathcal{F}| = \Omega(s\log\frac{m}{s})$.
+\end{itemize}
+Such a family can be obtained by considering a linear code (see details in
+\cite{pw11}).
+\end{comment}
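To make the final counting step of the proof sketch added above explicit, the two bounds on Bob's information chain together as follows. This is a sketch only; the constants $c_1, c_2$ are generic and not part of the diff, and the attribution to the two lemmas follows their names ($I(S, S') = \Omega(s\log\frac{m}{s})$ from Lemma~\ref{lemma:lowerinf}, $I(S, S') = O(n\log C)$ via Shannon-Hartley from Lemma~\ref{lemma:upperinf}):

% Chaining the lower and upper bounds on Bob's information about S
% and solving for n (c_1, c_2 are the implied constants):
\begin{displaymath}
  c_1\, s\log\frac{m}{s} \;\leq\; I(S, S') \;\leq\; c_2\, n\log C
  \quad\Longrightarrow\quad
  n \;\geq\; \frac{c_1}{c_2}\cdot\frac{s\log\frac{m}{s}}{\log C}
  \;=\; \Omega\Big(\frac{s\log\frac{m}{s}}{\log C}\Big).
\end{displaymath}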
diff --git a/paper/sections/results.tex b/paper/sections/results.tex
index e5aa861..ef205f4 100644
--- a/paper/sections/results.tex
+++ b/paper/sections/results.tex
@@ -347,16 +347,18 @@ Following similar notation, let $Q^* \defeq \nabla^2 f(\theta^*)$. Let $S$ and $
   The (S,s)-irrepresentability holds if $\nu_{\text{irrepresentable}}(S) < 1
   - \epsilon$ for $\epsilon > 0$.
 \end{definition}
-It is intuitive that the irrepresentability condition is stronger than the
-{\bf(RE)} assumption. In fact, a slightly modified result from
-\cite{vandegeer:2009} shows that a `strong' irrepresentability condition
-directly {\it implies} the {\bf(RE)} condition for $\ell_2$-recovery:
+It is possible to construct examples where the (RE) condition holds but not the
+irrepresentability condition \cite{vandegeer:2009}. In fact, a slightly
+modified result from \cite{vandegeer:2009} shows that a `strong'
+irrepresentability condition directly {\it implies} the {\bf(RE)} condition for
+$\ell_2$-recovery:
 \begin{proposition}
 \label{prop:irrepresentability}
 If the irrepresentability condition holds with $\epsilon > \frac{2}{3}$, then the restricted eigenvalue condition holds with constant $\gamma_n \geq \frac{(1 - 3(1 -\epsilon))^2 \lambda_{\min}^2}{4s}n$, where $\lambda_{\min} > 0$ is the smallest eigenvalue of $Q^*_{S,S}$, on which the results of \cite{Daneshmand:2014} also depend.
 \end{proposition}
+\begin{comment}
 Furthermore, recent papers \cite{vandegeer:2011}, \cite{Zou:2006}, argue that
 the irrepresentability condition is unrealistic in situations where there is
 a correlation between variables. Consider the following simplified example from
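As a sanity check on the constant in Proposition~\ref{prop:irrepresentability}, here is a worked instance; the value $\epsilon = \frac{3}{4}$ is chosen for illustration and does not come from the paper:

% Plugging epsilon = 3/4 > 2/3 into the bound of the proposition:
\begin{displaymath}
  \gamma_n \;\geq\; \frac{\big(1 - 3(1 - \tfrac{3}{4})\big)^2\lambda_{\min}^2}{4s}\,n
  \;=\; \frac{(\tfrac{1}{4})^2\lambda_{\min}^2}{4s}\,n
  \;=\; \frac{\lambda_{\min}^2}{64\,s}\,n,
\end{displaymath}

so the (RE) constant degrades only by a fixed numerical factor relative to $\lambda_{\min}^2 n / s$.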
@@ -373,7 +375,6 @@ I_s & \rho J \\
 where $I_s$ is the $s \times s$ identity matrix, $J$ is the all-ones matrix and
 $\rho \in \mathbb{R}^+$. It is easy to see that $\nu_{\text{irrepresentable}}(S) = \rho s$ and $\lambda_{\min}(Q) \geq 1 - \rho$, such that for any $\frac{1}{s} < \rho < 1$, the restricted eigenvalue condition holds trivially but the (S,s)-irrepresentability does not hold.
-\begin{comment}
 \begin{lemma}
 Let ${\cal C}({\cal M}, \bar {\cal M}^\perp, \theta^*) \defeq \{ \Delta \in \mathbb{R}^p | {\cal R}(\Delta_{\bar {\cal M}^\perp}) \leq 3 {\cal R}(\Delta_{\bar {\cal M}}) + 4 {\cal R}(\theta^*_{{\cal M}^\perp}) \}$, where $\cal R$ is a \emph{decomposable} regularizer with respect to $({\cal M}, \bar {\cal M}^\perp)$, and $({\cal M}, \bar {\cal M})$ are two subspaces such that ${\cal M} \subseteq \bar {\cal M}$. Suppose that $\exists \kappa_{\cal L} > 0, \; \exists \tau_{\cal L}, \; \forall \Delta \in {\cal C}, \; {\cal L}(\theta^* + \Delta) - {\cal L}(\theta^*) - \langle \nabla {\cal L}(\theta^*), \Delta \rangle \geq \kappa_{\cal L} \|\Delta\|^2 - \tau_{\cal L}^2(\theta^*)$. Let $\Psi({\cal M}) \defeq \sup_{u \in {\cal M} \backslash \{0\}} \frac{{\cal R}(u)}{\|u\|}$. Finally, suppose that $\lambda \geq 2 {\cal R}^*(\nabla {\cal L}(\theta^*))$, where ${\cal R}^*$ is the conjugate of ${\cal R}$. Then: $$\|\hat \theta_\lambda - \theta^* \|^2 \leq 9 \frac{\lambda^2}{\kappa_{\cal L}}\Psi^2(\bar {\cal M}) + \frac{\lambda}{\kappa_{\cal L}}\big\{2 \tau^2_{\cal L}(\theta^*) + 4 {\cal R}(\theta^*_{{\cal M}^\perp})\big\}$$
 \end{lemma}
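For reference, the claim $\nu_{\text{irrepresentable}}(S) = \rho s$ in the (now commented-out) example admits a one-line check, assuming the usual definition $\nu_{\text{irrepresentable}}(S) \defeq \|Q_{S^c,S}(Q_{S,S})^{-1}\|_\infty$ (the definition line itself is truncated in the hunk context above) and the block layout suggested by the visible context line `$I_s$ \& $\rho J$':

% With the support block equal to the identity, the inverse is trivial and
% the induced infinity-norm is the maximum row sum of the all-ones block:
\begin{displaymath}
  Q_{S,S} = I_s, \qquad Q_{S^c,S} = \rho J^{\top}
  \quad\Longrightarrow\quad
  \|Q_{S^c,S}(Q_{S,S})^{-1}\|_{\infty} = \|\rho J^{\top}\|_{\infty} = \rho s,
\end{displaymath}

since each row of the all-ones block contains exactly $s$ entries equal to $\rho$. The companion claim $\lambda_{\min}(Q) \geq 1 - \rho$ depends on the full definition of $Q$, which falls between the two hunks and is not shown here.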
