Diffstat (limited to 'paper')
-rw-r--r--  paper/paper.tex              19
-rw-r--r--  paper/sections/appendix.tex  36
-rw-r--r--  paper/sections/model.tex     15
-rw-r--r--  paper/sections/results.tex  265
-rw-r--r--  paper/sparse.bib             53
5 files changed, 206 insertions, 182 deletions
diff --git a/paper/paper.tex b/paper/paper.tex
index 5311bec..e7a40a6 100644
--- a/paper/paper.tex
+++ b/paper/paper.tex
@@ -71,6 +71,7 @@
\newtheorem{remark}{Remark}
\newtheorem{proposition}{Proposition}
\newtheorem{definition}{Definition}
+\newtheorem{fact}{Fact}
\begin{document}
@@ -81,12 +82,10 @@
% submissions: the style file will automatically remove it for you
% unless you've provided the [accepted] option to the icml2015
% package.
-\icmlauthor{Your Name}{email@yourdomain.edu}
-\icmladdress{Your Fantastic Institute,
- 314159 Pi St., Palo Alto, CA 94306 USA}
-\icmlauthor{Your CoAuthor's Name}{email@coauthordomain.edu}
-\icmladdress{Their Fantastic Institute,
- 27182 Exp St., Toronto, ON M6H 2T1 CANADA}
+\icmlauthor{Jean Pouget-Abadie}{jeanpougetabadie@g.harvard.edu}
+\icmladdress{Harvard University}
+\icmlauthor{Thibaut Horel}{thorel@seas.harvard.edu}
+\icmladdress{Harvard University}
% You may provide any keywords that you
% find helpful for describing your paper; these are used to populate
@@ -115,14 +114,6 @@
\label{sec:lowerbound}
\input{sections/lowerbound.tex}
-%\section{Assumptions}
-%\label{sec:assumptions}
-%\input{sections/assumptions.tex}
-
-% \section{Linear Threshold Model}
-% \label{sec:linear_threshold}
-% \input{sections/linear_threshold.tex}
-
\section{Experiments}
\label{sec:experiments}
\input{sections/experiments.tex}
diff --git a/paper/sections/appendix.tex b/paper/sections/appendix.tex
index afcf186..79baf2f 100644
--- a/paper/sections/appendix.tex
+++ b/paper/sections/appendix.tex
@@ -38,6 +38,42 @@
the proof.
\subsubsection{Approximate sparsity proof}
\subsubsection{RE with high probability}
+\begin{proof}Writing $H\defeq \nabla^2\mathcal{L}(\theta^*)$, if
+  $ \forall\Delta\in C(S),\;
+  \|\E[H] - H\|_\infty\leq \lambda $
+  and $\E[H]$ verifies the $(S,\gamma)$-(RE)
+  condition then:
+  \begin{equation}
+    \label{eq:foo}
+    \forall \Delta\in C(S),\;
+    \Delta H\Delta \geq
+    \Delta \E[H]\Delta(1-32s\lambda/\gamma)
+  \end{equation}
+  Indeed, $
+  |\Delta(H-\E[H])\Delta| \leq 2\lambda \|\Delta\|_1^2\leq
+  2\lambda(4\sqrt{s}\|\Delta_s\|_2)^2
+  $.
+  Writing
+  $\partial^2_{i,j}\mathcal{L}(\theta^*)=\frac{1}{|\mathcal{T}|}\sum_{t\in
+  T}Y_t$ and using $(LF)$ and $(LF2)$ we have $\big|Y_t - \E[Y_t]\big|\leq
+  \frac{4(M+2)}{\alpha}$.
+  Applying Azuma's inequality as in the proof of Lemma~\ref{lem:ub}, this
+  implies:
+  \begin{displaymath}
+    \P\big[\|\E[H]-H\|_{\infty}\geq\lambda\big] \leq
+    2\exp\left(-\frac{n\alpha\lambda^2}{4(M+2)} + 2\log m\right)
+  \end{displaymath}
+  Thus, if we take $\lambda=\sqrt{\frac{12(M+2)\log m}{\alpha
+  n^{1-\delta}}}$, $\|\E[H]-H\|_{\infty}\leq\lambda$ w.p. at least
+  $1-e^{-n^{\delta}\log m}$. When $n^{1-\delta}\geq
+  \frac{M+2}{21\gamma\alpha}s^2\log m$, \eqref{eq:foo} implies
+  $
+  \forall \Delta\in C(S),\;
+  \Delta H\Delta \geq \frac{1}{2} \Delta \E[H]\Delta,
+  $ w.p. at least $1-e^{-n^{\delta}\log m}$ and the conclusion of
+  Proposition~\ref{prop:fi} follows.
+\end{proof}
+
\subsection{Other continuous time processes binned to ours: proportional
hazards model}
diff --git a/paper/sections/model.tex b/paper/sections/model.tex
index b478e94..acec974 100644
--- a/paper/sections/model.tex
+++ b/paper/sections/model.tex
@@ -192,7 +192,7 @@
Generalized Linear Cascade model with inverse link function
$f:z\mapsto 1-e^{-\epsilon\cdot z}$.
\subsubsection{Logistic Cascades}
-
+\label{sec:logistic_cascades}
Consider the specific case of ``logistic cascades'' (where $f$ is the logistic function).
This model captures the idea that there is a threshold around which each
additional neighbor's contribution becomes significant: logistic
@@ -201,10 +201,7 @@
Threshold model~\cite{Kempe:03}. As we will see later in the analysis, the
Hessian of our optimization program simplifies in the case of a logistic
inverse link function to $\nabla^2\mathcal{L}(\theta^*) =
\frac{1}{|\mathcal{T}|}XX^T$ where $X$ is the observation matrix $[x^1 \ldots
-x^\mathcal{|T|}]$. The (RE)-condition we introduce subsequently is then
-equivalent to the assumption made in the Lasso analysis of~\cite{bickel:2009}.
-
-
+x^\mathcal{|T|}]$.
% \subsection{The Linear Threshold Model}
@@ -234,10 +231,10 @@
equivalent to the assumption made in the Lasso analysis of~\cite{bickel:2009}.
\begin{figure}
  \includegraphics[scale=.4]{figures/drawing.pdf}
-  \caption{Illustration of the sparse-recovery approach: the measurement matrix
-  is known, as is a Bernoulli realization of the matrix-vector product via the
-non-linear transformation induced by $f$. Our objective is to recover the
-unknown weight vector $\theta_i$ for each node $i$.}
+  \caption{Illustration of the sparse-recovery approach. Our objective is to
+  recover the unknown weight vector $\theta_j$ for each node $j$. We observe a
+Bernoulli realization of the $f$ transform of the matrix-vector product, where
+the measurement matrix encodes which nodes are ``contagious''.}
\end{figure}
diff --git a/paper/sections/results.tex b/paper/sections/results.tex
index 33bff55..a0d3159 100644
--- a/paper/sections/results.tex
+++ b/paper/sections/results.tex
@@ -3,7 +3,7 @@
assumptions our program~\eqref{eq:pre-mle} recovers the true parameter
$\theta_i$ of the cascade model. Furthermore, if we can estimate $\theta_i$ to
a sufficiently good accuracy, it is then possible to recover the support of
$\theta_i$ by simple thresholding, which provides a solution to the standard
-Graph Inference problem.
+Network Inference problem.
We will first give results in the exactly sparse setting in which $\theta_i$
has a support of size exactly $s$. We will then relax this sparsity constraint
@@ -24,8 +24,8 @@
write $S\defeq\text{supp}(\theta^*)$ and $s=|S|$. Recall, that $\theta_i$ is the
vector of weights for all edges \emph{directed at} the node we are solving for.
In other words, $S$ is the set of all nodes susceptible to influence node $i$,
also referred to as its parents. Our main theorem will rely on the now standard
-\emph{restricted eigenvalue condition}.
-{\color{red} Add reference}
+\emph{restricted eigenvalue condition} introduced
+by~\cite{bickel2009simultaneous}.
\begin{definition}
Let $\Sigma\in\mathcal{S}_m(\reals)$ be a real symmetric matrix and $S$ be
@@ -119,13 +119,12 @@
$\hat{\theta}_\lambda$ is the solution of \eqref{eq:pre-mle}:
To prove Theorem~\ref{thm:main}, we apply Lemma~\ref{lem:negahban} with
$\tau_{\mathcal{L}}=0$. Since $\mathcal{L}$ is twice differentiable and convex,
-assumption \eqref{eq:rc} with $\kappa_{\mathcal{L}}=\frac{\gamma}{2}$ is
-implied by the (RE)-condition.. The upper bound
-on the $\ell_{\infty}$ norm of $\nabla\mathcal{L}(\theta^*)$ is given by
-Lemma~\ref{lem:ub}.
+assumption \eqref{eq:rc} with $\kappa_{\mathcal{L}}=\frac{\gamma}{2}$ is implied
+by the (RE)-condition. For a good convergence rate, we must find the smallest
+possible value of $\lambda$ such that $\lambda \geq 2
+\|\nabla\mathcal{L}(\theta^*)\|_{\infty}$. The upper bound on the $\ell_{\infty}$
+norm of $\nabla\mathcal{L}(\theta^*)$ is given by Lemma~\ref{lem:ub}.
-{\color{red} explain usefulness/interpretation and contribution}
-{\color{red} Sketch proof, full proof in appendix}
\begin{lemma}
\label{lem:ub}
Assume {\bf(LF)} holds for some $\alpha>0$. For any $\delta\in(0,1)$:
@@ -137,12 +136,12 @@
Assume {\bf(LF)} holds for some $\alpha>0$. For any $\delta\in(0,1)$:
\end{displaymath}
\end{lemma}
-
-
-Note how the proof of Lemma~\ref{lem:ub} relied crucially on Azuma-Hoeffding's
-inequality, which allows us to handle correlated observations. We now show how
+The proof of Lemma~\ref{lem:ub} relies crucially on Azuma-Hoeffding's
+inequality, which allows us to handle correlated observations. This departs from
+the usual assumptions made in sparse recovery settings, where the measurements
+are assumed to be independent of one another. We now show how
to use Theorem~\ref{thm:main} to recover the support of $\theta^*$, that is, to
-solve the Graph Inference problem.
+solve the Network Inference problem.
\begin{corollary}
\label{cor:variable_selection}
@@ -168,7 +167,7 @@
contradiction. Therefore we get no false positives. If $\theta_i^* > \eta +
\end{proof}
Assuming we know a lower bound $\alpha$ on $\Theta_{i,j}$,
-Corollary~\ref{cor:variable_selection} can be applied to the Graph Inference
+Corollary~\ref{cor:variable_selection} can be applied to the Network Inference
problem in the following manner: pick $\epsilon = \frac{\eta}{2}$ and $\eta =
\frac{\alpha}{3}$, then $S_{\eta+\epsilon}^* = S$ provided that
$n=\Omega\left(\frac{s\log m}{\alpha^3\gamma^2}\right)$. That is, the support
@@ -198,7 +197,6 @@
In other words, the closer $\theta^*$ is to being sparse, the smaller the
price, and we recover the results of Section~\ref{sec:main_theorem} in the
limit of exact sparsity. These results are formalized in the following theorem,
which is also a consequence of Theorem 1 in \cite{Negahban:2009}.
-{\color{red} Include full proof in appendix}
\begin{theorem}
\label{thm:approx_sparse}
@@ -238,9 +236,10 @@
then similarly: ${\cal S}^*_{\eta + \epsilon} \subset \hat {\cal S}_\eta
\subsection{Restricted Eigenvalue Condition}
\label{sec:re}
-In this section, we discuss the main assumption of Theorem~\ref{thm:main}
-namely the restricted eigenvalue condition. We then compare to the
-irrepresentability condition considered in \cite{Daneshmand:2014}.
+In this section, we discuss the main assumption of Theorem~\ref{thm:main}, namely
+the restricted eigenvalue condition introduced by~\cite{bickel2009simultaneous}.
+We then compare to the irrepresentability condition considered in
+\cite{Daneshmand:2014}.
\paragraph{Interpretation}
@@ -248,26 +247,30 @@
There exists a large class of sufficient conditions under which sparse
recovery is achievable in the context of regularized estimation. A good survey
of these different assumptions can be found in \cite{vandegeer:2009}.
-The restricted eigenvalue condition introduce in \cite{bickel:2009} is one of
-the weakest such assumption. It can be interpreted as a restricted form of
-non-degeneracy. Since we apply it to the Hessian of the log-likelihood function
-$\nabla^2 \mathcal{L}(\theta)$, it essentially reduces to a form of restricted
-strong convexity, that Lemma~\ref{lem:negahban} ultimately relies on.
-
-In our case we have:
+The restricted eigenvalue condition is one of the weakest such assumptions. It
+can be interpreted as a restricted form of non-degeneracy.
+Since we apply it to the Hessian of the log-likelihood function
+$\nabla^2 \mathcal{L}(\theta)$, it essentially reduces to a form of restricted
+strong convexity, which Lemma~\ref{lem:negahban} ultimately relies on. Observe
+that the Hessian of $\mathcal{L}$ can be seen as a re-weighted Gram matrix of
+the observations:
\begin{multline*}
  \nabla^2\mathcal{L}(\theta^*) =
  \frac{1}{|\mathcal{T}|}\sum_{t\in\mathcal{T}}x^t(x^t)^T
  \bigg[x_i^{t+1}\frac{f''f-f'^2}{f^2}(\inprod{\theta^*}{x^t})\\
  -(1-x_i^{t+1})\frac{f''(1-f) + f'^2}{(1-f)^2}(\inprod{\theta^*}{x^t})\bigg]
\end{multline*}
-Observe that the Hessian of $\mathcal{L}$ can be seen as
-a re-weighted Gram matrix of the observations. In other words, the restricted
-eigenvalue condition sates that the observed set of active nodes are not
-too collinear with each other.
-
-{\color{red} if the function is strictly log-convex, then equivalent -> explain
-what the gram matrix is (explanation)}
+If $f$ and $1-f$ are $c$-strictly log-convex~\cite{bagnoli2005log} for $c>0$,
+then
+\begin{displaymath}
+  \min\left(\frac{f''f-f'^2}{f^2}(x), \frac{f''(1-f) + f'^2}{(1-f)^2}(x) \right)
+  \geq c
+\end{displaymath}
+and the $(S, \gamma)$-({\bf RE}) condition on the Hessian in
+Theorem~\ref{thm:main} and Theorem~\ref{thm:approx_sparse} reduces to a
+condition on the \emph{Gram matrix} of the observations $X^T X =
+\frac{1}{|\mathcal{T}|}\sum_{t\in\mathcal{T}}x^t(x^t)^T$ with $\gamma \leftarrow
+\gamma\cdot c$. The (RE)-condition is then equivalent to the assumption made in
+the Lasso analysis of~\cite{bickel:2009}.
\paragraph{(RE) with high probability}
@@ -278,17 +281,24 @@
this probabilistic model. Several recent papers show that large classes of
correlated designs obey the restricted eigenvalue property with high
probability \cite{raskutti:10, rudelson:13}.
-In our case, we can show that if (RE)-condition holds for the expected Hessian
-matrix $\E[\nabla^2\mathcal{L}(\theta^*)]$, then it holds for the finite sample
-Hessian matrix $\nabla^2\mathcal{L}(\theta^*)$ with high probability. Note that
-the expected Hessian matrix is exactly the Fisher Information matrix of the
-generalized linear cascade model which captures the amount of information about
-$\theta^*$ conveyed by the random observations. Therefore, under an assumption
-which only involves the probabilistic model and not the actual observations,
-Proposition~\ref{prop:fi} allows us to draw the same conclusion as in
-Theorem~\ref{thm:main}.
+The (RE)-condition is well-behaved in the following sense: if it holds for the
+expected Hessian matrix $\E[\nabla^2\mathcal{L}(\theta^*)]$, then it holds for
+the finite sample Hessian matrix $\nabla^2\mathcal{L}(\theta^*)$ with high
+probability.
-We will need the following additional assumptions on the inverse link function $f$:
+Note that the expected Hessian matrix is exactly the Fisher
+Information matrix of the generalized linear cascade model, which captures the
+amount of information about $\theta^*$ conveyed by the random observations.
+The (RE)-condition has an even more natural interpretation when $f$ and $1-f$
+are strictly log-convex, since the expected \emph{Gram matrix} $A \equiv
+\mathbb{E}[X^T X]$ is a matrix whose entry $a_{i,j}$ is the average number of
+times node $i$ and node $j$ are infected at the same time in the cascade
+process, and $a_{i,i}$ is the average number of times node $i$ is infected.
+
+Therefore, under an assumption which only involves the probabilistic model and
+not the actual observations, Proposition~\ref{prop:fi} allows us to draw the
+same conclusion as in Theorem~\ref{thm:main}. We will need the following
+additional assumptions on the inverse link function $f$:
\begin{equation}
\tag{LF2}
\|f'\|_{\infty} \leq M
@@ -309,73 +319,36 @@
again non restrictive in the (IC) model and (V) model.
condition, w.p $\geq 1-e^{-n^\delta\log m}$.
\end{proposition}
-{\color{red} sketch proof, full (AND BETTER) proof in appendix}
-\begin{proof}Writing $H\defeq \nabla^2\mathcal{L}(\theta^*)$, if
-  $ \forall\Delta\in C(S),\;
-  \|\E[H] - H]\|_\infty\leq \lambda $
-  and $\E[H]$ verifies the $(S,\gamma)$-(RE)
-  condition then:
-  \begin{equation}
-    \label{eq:foo}
-    \forall \Delta\in C(S),\;
-    \Delta H\Delta \geq
-    \Delta \E[H]\Delta(1-32s\lambda/\gamma)
-  \end{equation}
-  Indeed, $
-  |\Delta(H-E[H])\Delta| \leq 2\lambda \|\Delta\|_1^2\leq
-  2\lambda(4\sqrt{s}\|\Delta_s\|_2)^2
-  $.
-  Writing
-  $\partial^2_{i,j}\mathcal{L}(\theta^*)=\frac{1}{|\mathcal{T}|}\sum_{t\in
-  T}Y_t$ and using $(LF)$ and $(LF2)$ we have $\big|Y_t - \E[Y_t]\big|\leq
-  \frac{4(M+2)}{\alpha}$.
-  Applying Azuma's inequality as in the proof of Lemma~\ref{lem:ub}, this
-  implies:
-  \begin{displaymath}
-    \P\big[\|\E[H]-H\|_{\infty}\geq\lambda\big] \leq
-    2\exp\left(-\frac{n\alpha\lambda^2}{4(M+2)} + 2\log m\right)
-  \end{displaymath}
-  Thus, if we take $\lambda=\sqrt{\frac{12(M+2)\log m}{\alpha
-  n^{1-\delta}}}$, $\|E[H]-H\|_{\infty}\leq\lambda$ w.p at least
-  $1-e^{-n^{\delta}\log m}$. When $n^{1-\delta}\geq
-  \frac{M+2}{21\gamma\alpha}s^2\log m$, \eqref{eq:foo} implies
-  $
-  \forall \Delta\in C(S),\;
-  \Delta H\Delta \geq \frac{1}{2} \Delta \E[H]\Delta,
-  $ w.p. at least $1-e^{-n^{\delta}\log m}$ and the conclusion of
-  Proposition~\ref{prop:fi} follows.
-\end{proof}
-
Observe that the number of measurements required in Proposition~\ref{prop:fi}
is now quadratic in $s$. If we only keep the first measurement from each
cascade which are independent, we can apply Theorem 1.8 from
\cite{rudelson:13}, lowering the number of required cascades to $s\log m
\log^3( s\log m)$.
-\paragraph{(RE) vs Irrepresentability Condition}
+%\paragraph{(RE) vs Irrepresentability Condition}
-\cite{Daneshmand:2014} rely on an `incoherence' condition on the hessian of the
-likelihood function also known as the {\it (S,s)-irrepresentability} condition.
+%\cite{Daneshmand:2014} rely on an `incoherence' condition on the hessian of the
+%likelihood function also known as the {\it (S,s)-irrepresentability} condition.
-\begin{comment}
-\begin{definition}
-Following similar notation, let $Q^* \defeq \nabla^2 f(\theta^*)$. Let $S$ and
-$S^c$ be the set of indices of all the parents and non-parents respectively and
-$Q_{S,S}$, $Q_{S^c,S}$, $Q_{S, S^c}$, and $Q_{S^c, S^c}$ the induced
-sub-matrices.
Consider the following constant: +%\begin{equation} +%\nu_{\text{irrepresentable}}(S) \defeq \max_{\tau \in \mathbb{R}^p \ :\ \| \tau +%\|_{\infty} \leq 1} \|Q_{S^c, S} Q_{S, S}^{-1} \tau\|_{\infty} +%\end{equation} +%The (S,s)-irrepresentability holds if $\nu_{\text{irrepresentable}}(S) < 1 - +%\epsilon$ for $\epsilon > 0$ +%\end{definition} +%\end{comment} -It is possible to construct examples where the (RE) condition holds but not the -irrepresentability condition \cite{vandegeer:2009}. Theorem 9.1 from the same -paper shows that a `strong' irrepresentability condition directly {\it implies} -the {\bf(RE)} condition for $\ell_2$-recovery. +%It is possible to construct examples where the (RE) condition holds but not the +%irrepresentability condition \cite{vandegeer:2009}. Theorem 9.1 from the same +%paper shows that a `strong' irrepresentability condition directly {\it implies} +%the {\bf(RE)} condition for $\ell_2$-recovery. \begin{comment} \begin{proposition} @@ -388,53 +361,53 @@ the smallest eigenvalue of $Q^*_{S,S}$, on which the results of \end{proposition} \end{comment} -\begin{comment} -Furthermore, recent papers \cite{vandegeer:2011}, \cite{Zou:2006}, argue that -the irrepresentability condition is unrealistic in situations where there is -a correlation between variables. Consider the following simplified example from -\cite{vandegeer:2011}: -\begin{equation} -\nonumber -\left( -\begin{array}{cccc} -I_s & \rho J \\ -\rho J & I_s \\ -\end{array} -\right) -\end{equation} -where $I_s$ is the $s \times s$ identity matrix, $J$ is the all-ones matrix and -$\rho \in \mathbb{R}^+$. It is easy to see that $\nu_{\text{irrepresentable}}(S) -= \rho s$ and $\lambda_{\min}(Q) \geq 1 - \rho$, such that for any $\rho > -\frac{1}{s}$ and $\rho < 1$, the restricted eigenvalue holds trivially but the -(S,s)-irrepresentability does not hold. +%\begin{comment} +%Furthermore, recent papers \cite{vandegeer:2011}, \cite{Zou:2006}, argue that +%the irrepresentability condition is unrealistic in situations where there is +%a correlation between variables. Consider the following simplified example from +%\cite{vandegeer:2011}: +%\begin{equation} +%\nonumber +%\left( +%\begin{array}{cccc} +%I_s & \rho J \\ +%\rho J & I_s \\ +%\end{array} +%\right) +%\end{equation} +%where $I_s$ is the $s \times s$ identity matrix, $J$ is the all-ones matrix and +%$\rho \in \mathbb{R}^+$. It is easy to see that $\nu_{\text{irrepresentable}}(S) +%= \rho s$ and $\lambda_{\min}(Q) \geq 1 - \rho$, such that for any $\rho > +%\frac{1}{s}$ and $\rho < 1$, the restricted eigenvalue holds trivially but the +%(S,s)-irrepresentability does not hold. -\begin{lemma} -Let ${\cal C}({\cal M}, \bar {\cal M}^\perp, \theta^*) \defeq \{ \Delta \in -\mathbb{R}^p | {\cal R}(\Delta_{\bar {\cal M}^\perp} \leq 3 {\cal -R}(\Delta_{\bar {\cal M}} + 4 {\cal R}(\theta^*_{{\cal M}^\perp}) \}$, where -$\cal R$ is a \emph{decomposable} regularizer with respect to $({\cal M}, \bar -{\cal M}^\perp)$, and $({\cal M}, \bar {\cal M})$ are two subspaces such that -${\cal M} \subseteq \bar {\cal M}$. Suppose that $\exists \kappa_{\cal L} > 0, -\; \exists \tau_{\cal L}, \; \forall \Delta \in {\cal C}, \; {\cal L}(\theta^* + -\Delta) - {\cal L}(\theta^*) - \langle \Delta {\cal L}(\theta^*), \Delta \rangle -\geq \kappa_{\cal L} \|\Delta\|^2 - \tau_{\cal L}^2(\theta^*)$. Let $\Psi({\cal -M}) \defeq \sup_{u \in {\cal M} \backslash \{0\}} \frac{{\cal R}(u)}{\|u\|}$. 
-Finally suppose that $\lambda \geq 2 {\cal R}(\nabla {\cal L}(\theta^*))$, where -${\cal R}^*$ is the conjugate of ${\cal R}$. Then: $$\|\hat \theta_\lambda - -\theta^* \|^2 \leq 9 \frac{\lambda^2}{\kappa_{\cal L}}\Psi^2(\bar {\cal M}) + -\frac{\lambda}{\kappa_{\cal L}}\{2 \tau^2_{\cal L}(\theta^*) + 4 {\cal -R}(\theta^*_{{\cal M}^\perp}\}$$ -\end{lemma} +%\begin{lemma} +%Let ${\cal C}({\cal M}, \bar {\cal M}^\perp, \theta^*) \defeq \{ \Delta \in +%\mathbb{R}^p | {\cal R}(\Delta_{\bar {\cal M}^\perp} \leq 3 {\cal +%R}(\Delta_{\bar {\cal M}} + 4 {\cal R}(\theta^*_{{\cal M}^\perp}) \}$, where +%$\cal R$ is a \emph{decomposable} regularizer with respect to $({\cal M}, \bar +%{\cal M}^\perp)$, and $({\cal M}, \bar {\cal M})$ are two subspaces such that +%${\cal M} \subseteq \bar {\cal M}$. Suppose that $\exists \kappa_{\cal L} > 0, +%\; \exists \tau_{\cal L}, \; \forall \Delta \in {\cal C}, \; {\cal L}(\theta^* + +%\Delta) - {\cal L}(\theta^*) - \langle \Delta {\cal L}(\theta^*), \Delta \rangle +%\geq \kappa_{\cal L} \|\Delta\|^2 - \tau_{\cal L}^2(\theta^*)$. Let $\Psi({\cal +%M}) \defeq \sup_{u \in {\cal M} \backslash \{0\}} \frac{{\cal R}(u)}{\|u\|}$. +%Finally suppose that $\lambda \geq 2 {\cal R}(\nabla {\cal L}(\theta^*))$, where +%${\cal R}^*$ is the conjugate of ${\cal R}$. Then: $$\|\hat \theta_\lambda - +%\theta^* \|^2 \leq 9 \frac{\lambda^2}{\kappa_{\cal L}}\Psi^2(\bar {\cal M}) + +%\frac{\lambda}{\kappa_{\cal L}}\{2 \tau^2_{\cal L}(\theta^*) + 4 {\cal +%R}(\theta^*_{{\cal M}^\perp}\}$$ +%\end{lemma} -\subsection{The Independent Cascade Model} +%\subsection{The Independent Cascade Model} -Recall that to cast the Independent Cascade model as a Generalized Linear -Cascade, we performed the following change of variables: $\forall i,j -\ \Theta_{i,j} = \log(1 - p_{i,j})$. The previous results hold \emph{a priori} -for the ``effective'' parameter $\Theta$. The following lemma shows they also -hold for the original infection probabilities $p_{i,j}$: +%Recall that to cast the Independent Cascade model as a Generalized Linear +%Cascade, we performed the following change of variables: $\forall i,j +%\ \Theta_{i,j} = \log(1 - p_{i,j})$. The previous results hold \emph{a priori} +%for the ``effective'' parameter $\Theta$. The following lemma shows they also +%hold for the original infection probabilities $p_{i,j}$: -The results of sections~\ref{sec:main_theorem} and \ref{sec:relaxing_sparsity} -therefore hold for the original transition probabilities $p$. -\end{comment} +%The results of sections~\ref{sec:main_theorem} and \ref{sec:relaxing_sparsity} +%therefore hold for the original transition probabilities $p$. +%\end{comment} diff --git a/paper/sparse.bib b/paper/sparse.bib index 1dd7589..4aee818 100644 --- a/paper/sparse.bib +++ b/paper/sparse.bib @@ -14,7 +14,8 @@ year = {2006}, @inproceedings{GomezRodriguez:2010, author = {Gomez Rodriguez, Manuel and Leskovec, Jure and Krause, Andreas}, title = {Inferring Networks of Diffusion and Influence}, - booktitle = {Proceedings of the 16th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining}, + booktitle = {Proceedings of the 16th ACM SIGKDD International Conference on + Knowledge Discovery and Data Mining}, series = {KDD '10}, year = {2010}, isbn = {978-1-4503-0055-1}, @@ -141,7 +142,8 @@ fjournal = "Electronic Journal of Statistics", journal = "Electron. J. 
Statist.", pages = "688--749", publisher = "The Institute of Mathematical Statistics and the Bernoulli Society", -title = "The adaptive and the thresholded Lasso for potentially misspecified models (and a lower bound for the Lasso)", +title = "The adaptive and the thresholded Lasso for potentially misspecified + models (and a lower bound for the Lasso)", url = "http://dx.doi.org/10.1214/11-EJS624", volume = "5", year = "2011" @@ -164,8 +166,8 @@ URL = {http://dx.doi.org/10.1198/016214506000000735}, Jason N. Laska and Petros T. Boufounos and Richard G. Baraniuk}, - title = {Robust 1-Bit Compressive Sensing via Binary Stable Embeddings of Sparse - Vectors}, + title = {Robust 1-Bit Compressive Sensing via Binary Stable Embeddings of + Sparse Vectors}, journal = {{IEEE} Transactions on Information Theory}, volume = {59}, number = {4}, @@ -197,7 +199,8 @@ URL = {http://dx.doi.org/10.1198/016214506000000735}, author = {Ankit Gupta and Robert Nowak and Benjamin Recht}, - title = {Sample complexity for 1-bit compressed sensing and sparse classification}, + title = {Sample complexity for 1-bit compressed sensing and sparse + classification}, booktitle = {{IEEE} International Symposium on Information Theory, {ISIT} 2010, June 13-18, 2010, Austin, Texas, USA, Proceedings}, pages = {1553--1557}, @@ -290,8 +293,8 @@ year = "2009" author = {Eric Price and David P. Woodruff}, title = {{(1} + eps)-Approximate Sparse Recovery}, - booktitle = {{IEEE} 52nd Annual Symposium on Foundations of Computer Science, {FOCS} - 2011, Palm Springs, CA, USA, October 22-25, 2011}, + booktitle = {{IEEE} 52nd Annual Symposium on Foundations of Computer Science, + {FOCS} 2011, Palm Springs, CA, USA, October 22-25, 2011}, pages = {295--304}, year = {2011}, crossref = {DBLP:conf/focs/2011}, @@ -318,8 +321,8 @@ year = "2009" @inproceedings{pw12, author = {Eric Price and David P. 
-  title = {Applications of the Shannon-Hartley theorem to data streams and sparse
-  recovery},
+  title = {Applications of the Shannon-Hartley theorem to data streams and
+  sparse recovery},
booktitle = {Proceedings of the 2012 {IEEE} International Symposium on Information Theory, {ISIT} 2012, Cambridge, MA, USA, July 1-6, 2012},
pages = {2446--2450},
@@ -481,9 +484,11 @@
year = "2009"
}
@article{zhang2014,
-  title={Confidence intervals for low dimensional parameters in high dimensional linear models},
+  title={Confidence intervals for low dimensional parameters in high dimensional
+    linear models},
author={Zhang, Cun-Hui and Zhang, Stephanie S},
-  journal={Journal of the Royal Statistical Society: Series B (Statistical Methodology)},
+  journal={Journal of the Royal Statistical Society: Series B (Statistical
+    Methodology)},
volume={76},
number={1},
pages={217--242},
@@ -492,7 +497,8 @@
year = "2009"
}
@article{javanmard2014,
-  title={Confidence intervals and hypothesis testing for high-dimensional regression},
+  title={Confidence intervals and hypothesis testing for high-dimensional
+    regression},
author={Javanmard, Adel and Montanari, Andrea},
journal={The Journal of Machine Learning Research},
volume={15},
@@ -514,7 +520,8 @@
year = "2009"
}
@article{candes2006near,
-  title={Near-optimal signal recovery from random projections: Universal encoding strategies?},
+  title={Near-optimal signal recovery from random projections: Universal
+    encoding strategies?},
author={Candes, Emmanuel J and Tao, Terence},
journal={Information Theory, IEEE Transactions on},
volume={52},
@@ -523,3 +530,23 @@
year = "2009"
year={2006},
publisher={IEEE}
}
+
+@article{bickel2009simultaneous,
+  title={Simultaneous analysis of Lasso and Dantzig selector},
+  author={Bickel, Peter J and Ritov, Ya'acov and Tsybakov, Alexandre B},
+  journal={The Annals of Statistics},
+  pages={1705--1732},
+  year={2009},
+  publisher={JSTOR}
+}
+
+@article{bagnoli2005log,
+  title={Log-concave probability and its applications},
+  author={Bagnoli, Mark and Bergstrom, Ted},
+  journal={Economic theory},
+  volume={26},
+  number={2},
+  pages={445--469},
+  year={2005},
+  publisher={Springer}
+}
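
To make the recovery procedure in the results.tex changes above concrete, here is a minimal Python sketch of the full pipeline for a single node i: draw observations x^t of which nodes are contagious, draw the Bernoulli outcome with probability f(<theta_i, x^t>) for a logistic inverse link (the ``logistic cascades'' case of model.tex), solve an l1-regularized MLE, and threshold at eta = alpha/3 as in the variable-selection corollary. This is an illustration written for this page, not code from the paper or the commit: all constants are arbitrary, the contagious sets are drawn i.i.d. instead of being generated by a true cascade, and scikit-learn's l1-penalized logistic regression stands in for program (eq:pre-mle).

import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
m, s, alpha = 50, 5, 1.0        # nodes, parents of node i, weight lower bound (illustrative)
n = 20000                       # number of observed cascade steps |T| (illustrative)

theta = np.zeros(m)             # true incoming weight vector theta_i of a single node i
parents = rng.choice(m, size=s, replace=False)
theta[parents] = alpha + 0.5 * rng.random(s)

def f(z):                       # logistic inverse link ("logistic cascades")
    return 1.0 / (1.0 + np.exp(-z))

# Row t of X encodes which nodes are contagious at step t; y[t] is the Bernoulli
# realization of f(<theta_i, x^t>), i.e. the generalized linear cascade observation.
X = (rng.random((n, m)) < 0.2).astype(float)
y = (rng.random(n) < f(X @ theta)).astype(int)

# l1-regularized maximum likelihood with lambda on the sqrt(log m / n) scale of
# Lemma lem:ub; the 0.2 prefactor is an arbitrary choice, since the theory only
# fixes the scaling.  sklearn's C is the inverse penalty weight, C = 1 / (n * lambda).
lam = 0.2 * np.sqrt(np.log(m) / n)
mle = LogisticRegression(penalty="l1", C=1.0 / (n * lam), solver="liblinear",
                         fit_intercept=False)
mle.fit(X, y)
theta_hat = mle.coef_.ravel()

# Support recovery by thresholding at eta = alpha / 3, as in the variable-selection corollary.
recovered = np.flatnonzero(theta_hat > alpha / 3)
print("true parents:     ", sorted(parents.tolist()))
print("recovered parents:", sorted(recovered.tolist()))

The choices lambda ~ sqrt(log m / n) and eta = alpha / 3 mirror Lemma lem:ub and Corollary cor:variable_selection up to the absolute constants, which the theory leaves unspecified.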
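
Proposition prop:fi, whose proof this commit moves into appendix.tex, rests on a concentration step: the finite-sample Hessian stays within a small sup-norm distance lambda of its expectation (the Fisher information), so the factor (1 - 32 s lambda / gamma) in eq:foo stays above 1/2. The sketch below, again an illustration rather than the authors' code, only visualizes the sup-norm concentration of the empirical Gram matrix (1/|T|) sum_t x^t (x^t)^T around its expectation for i.i.d. Bernoulli activations with a hypothetical probability p; the paper's observations are correlated, which is why the actual proof goes through Azuma's inequality instead.

import numpy as np

rng = np.random.default_rng(1)
m, p = 50, 0.2                                   # number of nodes, activation probability (illustrative)
expected = np.full((m, m), p * p)                # E[x_i x_j] = p^2 for i != j
np.fill_diagonal(expected, p)                    # E[x_i^2]   = p  on the diagonal

for n in (100, 1000, 10000):
    X = (rng.random((n, m)) < p).astype(float)   # n observed "contagious" indicator vectors
    empirical = X.T @ X / n                      # empirical Gram matrix (1/|T|) sum_t x^t (x^t)^T
    deviation = np.abs(empirical - expected).max()
    print(f"n = {n:6d}   sup-norm deviation = {deviation:.4f}")

The printed deviation shrinks at roughly the sqrt(log m / n) rate at which lambda is chosen in the proof; once 32 s lambda / gamma <= 1/2, the finite-sample matrix inherits the (S, gamma)-(RE) condition with constant gamma / 2.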
