Diffstat (limited to 'paper/sections')
 paper/sections/appendix.tex | 23
 paper/sections/model.tex    |  8
 paper/sections/results.tex  | 48
 3 files changed, 37 insertions(+), 42 deletions(-)
diff --git a/paper/sections/appendix.tex b/paper/sections/appendix.tex
index 32e2775..22b87c2 100644
--- a/paper/sections/appendix.tex
+++ b/paper/sections/appendix.tex
@@ -5,11 +5,13 @@ the paper.
 
 \subsection{Proofs of Section~\ref{sec:results}}
 
-\paragraph{Bounded gradient} The proof of Lemma~\ref{lem:ub} giving an upper
-bound on the $\ell_{\infty}$ norm of the gradient of the likelihood function is
-given below.
+\begin{proof}[Proof of Lemma~\ref{lem:transform}]
+Using the inequality $\forall x>0, \; \log x \geq 1 - \frac{1}{x}$, we have
+$|\log (\frac{1}{1 - p}) - \log (\frac{1}{1-p'})| \geq \max(1 - \frac{1-p}{1-p'},
+1 - \frac{1-p'}{1-p}) \geq \max( p-p', p'-p)$.
+\end{proof}
 
-\begin{proof}
+\begin{proof}[Proof of Lemma~\ref{lem:ub}]
 The gradient of $\mathcal{L}$ is given by:
 \begin{multline*}
   \nabla \mathcal{L}(\theta^*) =
@@ -42,6 +44,16 @@ Choosing $\lambda\defeq 2\sqrt{\frac{\log m}{\alpha n^{1-\delta}}}$ concludes
 the proof.
 \end{proof}
 
+\begin{proof}[Proof of Corollary~\ref{cor:variable_selection}]
+By choosing $\delta = 0$, if $n > \frac{9s\log m}{\alpha\gamma^2\epsilon^2}$,
+then $\|\hat \theta-\theta^*\|_2 < \epsilon < \eta$ with probability
+$1-\frac{1}{m}$. If $\theta_i^* = 0$ and $\hat \theta_i > \eta$, then $\|\hat
+\theta - \theta^*\|_2 \geq |\hat \theta_i-\theta_i^*| > \eta$, which is a
+contradiction. Therefore we get no false positives. If $\theta_i^* > \eta +
+\epsilon$, then $|\hat{\theta}_i- \theta_i^*| < \epsilon \implies \hat{\theta}_i >
+\eta$ and we get all strong parents.
+\end{proof}
+
 %\subsubsection{Approximate sparsity proof}
 \paragraph{(RE) with high probability} We now prove Proposition~\ref{prop:fi}.
 The proof mostly relies on showing that the Hessian of likelihood function
@@ -113,6 +125,7 @@ C)$. These two bounds together imply the theorem.
 \begin{figure}
   \centering
   \includegraphics[scale=.4]{figures/n_nodes_running_time.pdf}
+  \vspace{-1em}
   \caption{Running time analysis for estimating the parents of a \emph{single
   node} on a Barabasi-Albert graph as a function of the number of nodes in
   the graph. The parameter $k$ (number of nodes each new node is attached to)
@@ -120,6 +133,7 @@ C)$. These two bounds together imply the theorem.
   the edge weights are chosen uniformly at random in $[.2,.7]$. The
   penalization parameter $\lambda$ is chosen equal to $.1$.}
   \label{fig:running_time_n_nodes}
+  \vspace{-1em}
 \end{figure}
 
 \begin{figure}
@@ -130,6 +144,7 @@ C)$. These two bounds together imply the theorem.
   observed cascades. The parameters defining the graph were set as in
   Figure~\ref{fig:running_time_n_nodes}.}
   \label{fig:running_time_n_cascades}
+  \vspace{-1em}
 \end{figure}
 
 We include here a running time analysis of our algorithm. In
diff --git a/paper/sections/model.tex b/paper/sections/model.tex
index fd25c27..532ee5e 100644
--- a/paper/sections/model.tex
+++ b/paper/sections/model.tex
@@ -140,13 +140,9 @@ the recovery error on $\Theta_j$ is an upper bound on the error on the
 original $p_j$ parameters.
 
 \begin{lemma}
+  \label{lem:transform}
   $\|\hat{\theta} - \theta^* \|_2 \geq \|\hat{p} - p^*\|_2$.
 \end{lemma}
-\begin{proof}
-Using the inequality $\forall x>0, \; \log x \geq 1 - \frac{1}{x}$, we have
-$|\log (\frac{1}{1 - p}) - \log (\frac{1}{1-p'})| \geq \max(1 - \frac{1-p}{1-p'},
-1 - \frac{1-p'}{1-p}) \geq \max( p-p', p'-p)$.
-\end{proof}
 
 \subsubsection{The Linear Voter Model}
 
@@ -334,7 +330,7 @@ $\mathcal{L}_i$ is equal to $-\infty$ when the parameters are outside of the
 domain of definition of the models, these contraints do not need to appear
 explicitly in the optimization program.
 
-In the specific case of the voter model the constraint $\sum_j \Theta_{i,j}
+In the specific case of the voter model, the constraint $\sum_j \Theta_{i,j}
 = 1$ will not necessarily be verified by the estimator obtained in
 \eqref{eq:pre-mle}. In some applications, the experimenter might not need
 this constraint to be verified, in which case the results in
diff --git a/paper/sections/results.tex b/paper/sections/results.tex
index 2292ce3..cab27a7 100644
--- a/paper/sections/results.tex
+++ b/paper/sections/results.tex
@@ -139,16 +139,6 @@ S}^*_{\eta + \epsilon} \subseteq \hat {\cal S}_\eta \subseteq {\cal S}^*$. In
 other words we recover all `strong' parents and no `false' parents.
 \end{corollary}
 
-\begin{proof}
-By choosing $\delta = 0$, if $ n > \frac{9s\log m}{\alpha\gamma^2\epsilon^2}$,
-then $\|\hat \theta-\theta^*\|_2 < \epsilon < \eta$ with probability
-$1-\frac{1}{m}$. If $\theta_i^* = 0$ and $\hat \theta > \eta$, then $\|\hat
-\theta - \theta^*\|_2 \geq |\hat \theta_i-\theta_i^*| > \eta$, which is a
-contradiction. Therefore we get no false positives. If $\theta_i^* > \eta +
-\epsilon$, then $|\hat{\theta}_i- \theta_i^*| < \epsilon \implies \theta_j >
-\eta$ and we get all strong parents.
-\end{proof}
-
 Assuming we know a lower bound $\alpha$ on $\Theta_{i,j}$,
 Corollary~\ref{cor:variable_selection} can be applied to the Network
 Inference problem in the following manner: pick $\epsilon = \frac{\eta}{2}$ and $\eta
@@ -199,9 +189,11 @@ solving \eqref{eq:pre-mle} for $\lambda \defeq 2\sqrt{\frac{\log m}{\alpha n^{1
 \end{align*}
 \end{theorem}
 
-As before, edge recovery is a consequence of upper-bounding $\|\theta^* - \hat
-\theta\|_2$
+As in Corollary~\ref{cor:variable_selection}, an edge recovery guarantee can be
+derived from Theorem~\ref{thm:approx_sparse} in the case of approximate
+sparsity.
 
+\begin{comment}
 \begin{corollary}
   Under the same assumptions as Theorem~\ref{thm:approx_sparse}, if the number
   of measurements verifies:
@@ -212,22 +204,18 @@ As before, edge recovery is a consequence of upper-bounding $\|\theta^* - \hat
   then: ${\cal S}^*_{\eta + \epsilon} \subset \hat {\cal S}_\eta \subset {\cal
   S}^*$ w.p. at least $1-\frac{1}{m}$.
 \end{corollary}
+\end{comment}
 
 \subsection{Restricted Eigenvalue Condition}
 \label{sec:re}
 
-In this section, we discuss the main assumption of Theorem~\ref{thm:main} namely
-the restricted eigenvalue condition introduced by~\cite{bickel2009simultaneous}.
-
-\paragraph{Interpretation}
-
 There exists a large class of sufficient conditions under which sparse recovery
 is achievable in the context of regularized estimation~\cite{vandegeer:2009}.
-The restricted eigenvalue condition is one of the weakest such assumption. It
-can be interpreted as a restricted form of non-degeneracy. Since we apply it to
-the Hessian of the log-likelihood function $\nabla^2 \mathcal{L}(\theta)$, it
-essentially reduces to a form of restricted strong convexity, that
-Lemma~\ref{lem:negahban} ultimately relies on.
+The restricted eigenvalue condition, introduced in \cite{bickel:2009}, is one
+of the weakest such assumptions. It can be interpreted as a restricted form of
+non-degeneracy. Since we apply it to the Hessian of the log-likelihood function
+$\nabla^2 \mathcal{L}(\theta)$, it essentially reduces to a form of restricted
+strong convexity, which Lemma~\ref{lem:negahban} ultimately relies on.
 
 Observe that the Hessian of $\mathcal{L}$ can be seen as a re-weighted
 \emph{Gram matrix} of the observations:
@@ -238,16 +226,12 @@ Observe that the Hessian of $\mathcal{L}$ can be seen as a re-weighted
     -(1-x_i^{t+1})\frac{f''(1-f) + f'^2}{(1-f)^2}(\inprod{\theta^*}{x^t})\bigg]
 \end{multline*}
 If $f$ and $1-f$ are $c$-strictly log-convex~\cite{bagnoli2005log} for $c>0$,
-then
-\begin{displaymath}
-  \min\left(\frac{f''f-f'^2}{f^2}(x), \frac{f''(1-f) + f'^2}{(1-f)^2}(x) \right)
-  \geq c
-\end{displaymath}
-and the $(S, \gamma)$-({\bf RE}) condition on the Hessian in
-Theorem~\ref{thm:main} and Theorem~\ref{thm:approx_sparse} reduces to a
-condition on the \emph{Gram matrix} of the observations $X^T X =
-\frac{1}{|\mathcal{T}|}\sum_{t\in\mathcal{T}}x^t(x^t)^T$ for $\gamma' \defeq
-\gamma\cdot c$.
+then $\min\left((\log f)'', (\log (1-f))''\right) \geq c$. This implies that
+the $(S, \gamma)$-({\bf RE}) condition in Theorem~\ref{thm:main} and
+Theorem~\ref{thm:approx_sparse} reduces to a condition on the
+\emph{Gram matrix} of the observations
+$X^T X = \frac{1}{|\mathcal{T}|}\sum_{t\in\mathcal{T}}x^t(x^t)^T$
+for $\gamma' \defeq \gamma\cdot c$.
 
 \paragraph{(RE) with high probability}
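The proof of Lemma~\ref{lem:transform} above compresses the coordinate-wise argument into a single display. A sketch of the expanded chain is given below; it assumes the reparametrization $\theta = \log\frac{1}{1-p}$ with $p, p' \in [0,1)$, which is what the proof manipulates but which this hunk does not state explicitly.

\begin{align*}
  \left|\log\frac{1}{1-p} - \log\frac{1}{1-p'}\right|
    &= \left|\log\frac{1-p'}{1-p}\right|
    \geq \max\left(1 - \frac{1-p}{1-p'},\, 1 - \frac{1-p'}{1-p}\right)
    % apply $\log x \geq 1 - 1/x$ to $x = \frac{1-p'}{1-p}$ and to $1/x$
    \\
    &= \max\left(\frac{p-p'}{1-p'},\, \frac{p'-p}{1-p}\right)
    \geq |p - p'|
    % the denominators lie in $(0,1]$, so the larger ratio is at least $|p-p'|$
\end{align*}

Applying this bound to every coordinate and summing the squares gives $\|\hat{\theta} - \theta^*\|_2^2 \geq \|\hat{p} - p^*\|_2^2$, which is the statement of the lemma.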
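The relocated proof of Corollary~\ref{cor:variable_selection} gives an explicit sample-size requirement. Purely for illustration, the display below instantiates it with hypothetical values ($m = 1000$ nodes, $s = 5$, $\alpha = 0.1$, $\gamma = 0.5$, $\eta = 0.2$, $\epsilon = \eta/2 = 0.1$, natural logarithm); none of these numbers come from the paper's experiments.

\begin{displaymath}
  % hypothetical parameter values, chosen only to show the order of magnitude
  n \;>\; \frac{9 s \log m}{\alpha \gamma^2 \epsilon^2}
    \;=\; \frac{9 \cdot 5 \cdot \log 1000}{0.1 \cdot 0.5^2 \cdot 0.1^2}
    \;\approx\; \frac{311}{2.5 \cdot 10^{-4}}
    \;\approx\; 1.2 \cdot 10^{6},
\end{displaymath}

i.e.\ on the order of a million observations at this hypothetical parameter setting, with the guarantee holding with probability $1 - \frac{1}{m}$.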
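The last hunk above replaces the explicit weight bound with the log-convexity shorthand. The estimate it relies on can be written in one line; the display below is a sketch under the assumption that the scalar weight multiplying each rank-one term $x^t(x^t)^T$ in the Hessian is bounded below by $c$ (this is what the $c$-strict log-convexity of $f$ and $1-f$ is invoked for).

\begin{displaymath}
  % if every weight in the re-weighted Gram matrix is at least $c$, then for all $v$
  v^T \nabla^2 \mathcal{L}(\theta^*)\, v
    \;\geq\; c \cdot v^T \Big( \frac{1}{|\mathcal{T}|} \sum_{t\in\mathcal{T}} x^t (x^t)^T \Big) v
    \;=\; c \cdot v^T X^T X \, v.
\end{displaymath}

In particular, a restricted eigenvalue bound for the Gram matrix $X^T X$ transfers to the Hessian at the cost of the factor $c$, which is where the constant $\gamma' \defeq \gamma \cdot c$ in the text comes from.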
