3 files changed, 34 insertions, 36 deletions
diff --git a/paper/sections/experiments.tex b/paper/sections/experiments.tex
index cacc882..7336f93 100644
--- a/paper/sections/experiments.tex
+++ b/paper/sections/experiments.tex
@@ -36,6 +36,7 @@
 graph which is: (d) exactly sparse (e) non-exactly sparse, as a function of the
 number of cascades $n$. Figure (f) plots the F$1$-score for the Watts-Strogatz
 graph as a function of $p_{init}$.}~\label{fig:four_figs}
+\vspace{-2em}
 \end{table*}
 
 In this section, we validate empirically the results and assumptions of
diff --git a/paper/sections/model.tex b/paper/sections/model.tex
index 532ee5e..5d3a232 100644
--- a/paper/sections/model.tex
+++ b/paper/sections/model.tex
@@ -130,14 +130,12 @@ Defining $\Theta_{i,j} \defeq \log(\frac{1}{1-p_{i,j}})$, this can be rewritten
 \end{align*}
 
 Therefore, the independent cascade model is a Generalized Linear Cascade model
-with inverse link function $f : z \mapsto 1 - e^{-z}$.
-
-Note that to write the Independent Cascade Model as a Generalized Linear
-Cascade Model, we had to introduce the change of variable $\Theta_{i,j}
-= \log(\frac{1}{1-p_{i,j}})$. The recovery results in Section~\ref{sec:results}
-pertain to the $\Theta_j$ parameters. Fortunately, the following lemma shows that
-the recovery error on $\Theta_j$ is an upper bound on the error on the original
-$p_j$ parameters.
+with inverse link function $f : z \mapsto 1 - e^{-z}$. Note that to write the
+Independent Cascade Model as a Generalized Linear Cascade Model, we had to
+introduce the change of variable $\Theta_{i,j} = \log(\frac{1}{1-p_{i,j}})$.
+The recovery results in Section~\ref{sec:results} pertain to the $\Theta_j$
+parameters. Fortunately, the following lemma shows that the recovery error on
+$\Theta_j$ is an upper bound on the error on the original $p_j$ parameters.
 
 \begin{lemma}
     \label{lem:transform}
@@ -181,12 +179,12 @@ Let $\text{Exp}(p)$ be an exponentially-distributed random variable of parameter
 and let $\Theta_{i,j}$ be the rate of transmission along directed edge $(i,j)$
 in the CICE model.  By the memoryless property of the exponential, if $X^k_j
 \neq 1$:
-\begin{align*}
-  \mathbb{P}(X^{k+1}_j = 1 | X^k) & = \mathbb{P}(\min_{i \in {\cal N}(j)}
+\begin{multline*}
+  \mathbb{P}(X^{k+1}_j = 1 | X^k) = \mathbb{P}(\min_{i \in {\cal N}(j)}
     \text{Exp}(\Theta_{i,j}) \leq \epsilon) \\
-    & = \mathbb{P}(\text{Exp}( \sum_{i=1}^m \Theta_{i,j} X^t_i) \leq \epsilon) \\
-  & = 1 - e^{- \epsilon \Theta_j \cdot X^t}
-\end{align*}
+    = \mathbb{P}(\text{Exp}( \sum_{i=1}^m \Theta_{i,j} X^t_i) \leq \epsilon)
+  = 1 - e^{- \epsilon \Theta_j \cdot X^t}
+\end{multline*}
 Therefore, the $\epsilon$-discretized CICE-induced process is a
 Generalized Linear Cascade model with inverse link function $f:z\mapsto
 1-e^{-\epsilon\cdot z}$.
@@ -194,10 +192,8 @@ Generalized Linear Cascade model with inverse link function $f:z\mapsto
 \subsubsection{Logistic Cascades}
 \label{sec:logistic_cascades}
 ``Logistic cascades'' is the specific case where the inverse link function is
-given by the logistic function:
-\begin{displaymath}
-    f(z) = \frac{1}{1+e^{-z + t}}
-\end{displaymath}
+given by the logistic function
+$f(z) = 1/(1+e^{-z + t})$.
 Intuitively, this captures the idea that there is a threshold $t$ such that
 when the sum of the parameters of the infected parents of a node is larger than
 the threshold, the probability of getting infected is close to one. This is
@@ -277,11 +273,11 @@ and:
 \end{multline*}
 
 In the case of the voter model, the measurements include all time steps until
-we reach the time horizon $T$ or the graph coalesces to a single state. In the
-case of the independent cascade model, the measurements include all time steps
-until node $i$ becomes contagious, after which its behavior is deterministic.
-Contrary to prior work, our results depend on the number of
-measurements and not the number of cascades.
+we reach the time horizon $T$ or the graph coalesces to a single state. For the
+independent cascade model, the measurements include all time steps until node
+$i$ becomes contagious, after which its behavior is deterministic.  Contrary to
+prior work, our results depend on the number of measurements and not the number
+of cascades.
 
 \paragraph{Regularity assumptions}
 
@@ -294,12 +290,13 @@ Model and Voter Model.
 
 Furthermore, the data-dependent bounds in Section~\ref{sec:main_theorem} will
 require the following regularity assumption on the inverse link function $f$:
+there exists $\alpha\in(0,1)$ such that
 \begin{equation}
   \tag{LF}
   \max \big\{ | (\log f)'(z_x) |, |(\log (1-f))'(z_x) | \big\}
   \leq \frac{1}{\alpha}
 \end{equation}
-for some $\alpha\in(0, 1)$ and for all $z_x\defeq\inprod{\theta^*}{x}$ such that
+for all $z_x\defeq\inprod{\theta^*}{x}$ such that
 $f(z_x)\notin\{0,1\}$.
 
 In the voter model, $\frac{f'(z)}{f(z)} = \frac{1}{z}$ and
diff --git a/paper/sections/results.tex b/paper/sections/results.tex
index cab27a7..e91cad4 100644
--- a/paper/sections/results.tex
+++ b/paper/sections/results.tex
@@ -242,22 +242,15 @@ this probabilistic model. Several recent papers show that large classes of
 correlated designs obey the restricted eigenvalue property with high
 probability \cite{raskutti:10, rudelson:13}.
 
-The {\bf(RE)}-condition is well-behaved in the following sense: if it holds for
-the expected Hessian matrix $\E[\nabla^2\mathcal{L}(\theta^*)]$, then it holds
-for the finite sample Hessian matrix $\nabla^2\mathcal{L}(\theta^*)$ with high
-probability.
+The {\bf(RE)}-condition has the following concentration property: if it holds
+for the expected Hessian matrix $\E[\nabla^2\mathcal{L}(\theta^*)]$, then it
+holds for the finite sample Hessian matrix $\nabla^2\mathcal{L}(\theta^*)$ with
+high probability.
 
-If $f$ and $1-f$ are strictly log-convex, then the previous observations yields
-the following natural interpretation of the {\bf(RE)}-condition: the expected
-\emph{Gram matrix} $A \equiv \mathbb{E}[X^T X]$ is a matrix whose entry
-$a_{i,j}$ is the average fraction of times node $i$ and node $j$ are infected
-at the same time during several runs of the cascade process. Note that the
-diagonal terms $a_{i,i}$ are simply the average fraction of times the
-corresponding node $i$ is infected.
 
 Therefore, under an assumption which only involves the probabilistic model and
-not the actual observations, Proposition~\ref{prop:fi} allows us to draw the
-same conclusion as in Theorem~\ref{thm:main}.
+not the actual observations, we can obtain the same conclusion as in
+Theorem~\ref{thm:main}:
 
 \begin{proposition}
     \label{prop:fi}
@@ -274,6 +267,13 @@ cascade, which are independent, we can apply Theorem 1.8 from
 \cite{rudelson:13}, lowering the number of required cascades to $s\log m \log^3(
 s\log m)$.
 
+If $f$ and $1-f$ are strictly log-convex, then the previous observations show
+that the quantity $\E[\nabla2\mathcal{L}(\theta^*)]$ in
+Proposition~\ref{prop:fi} can be replaced by the expected \emph{Gram matrix}:
+$A \equiv \mathbb{E}[X^T X]$. This matrix $A$ has a natural interpretation: the
+entry $a_{i,j}$ is the probability that node $i$ and node $j$ are infected at
+the same time during a cascade. In particular, the diagonal term $a_{i,i}$ is
+simply the probability that node $i$ is infected during a cascade.
 %\paragraph{(RE) vs Irrepresentability Condition}
 
 %\cite{Daneshmand:2014} rely on an `incoherence' condition on the hessian of the