Diffstat (limited to 'paper/sections')
-rw-r--r--  paper/sections/appendix.tex  37
-rw-r--r--  paper/sections/model.tex     57
-rw-r--r--  paper/sections/results.tex   74
3 files changed, 88 insertions, 80 deletions
diff --git a/paper/sections/appendix.tex b/paper/sections/appendix.tex
index ff4c5dd..afcf186 100644
--- a/paper/sections/appendix.tex
+++ b/paper/sections/appendix.tex
@@ -1,10 +1,45 @@
\subsection{Proofs of the different lemmas}
\subsubsection{Bounded gradient}
+
+\begin{proof}
+The gradient of $\mathcal{L}$ is given by:
+\begin{multline*}
+ \nabla \mathcal{L}(\theta^*) =
+ \frac{1}{|\mathcal{T}|}\sum_{t\in \mathcal{T}}x^t\bigg[
+ x_i^{t+1}\frac{f'}{f}(\inprod{\theta^*}{x^t})\\
+ - (1-x_i^{t+1})\frac{f'}{1-f}(\inprod{\theta^*}{x^t})\bigg]
+\end{multline*}
+
+Let $\partial_j \mathcal{L}(\theta^*)$ be the $j$-th coordinate of
+$\nabla\mathcal{L}(\theta^*)$ and write
+$\partial_j\mathcal{L}(\theta^*)
+= \frac{1}{|\mathcal{T}|}\sum_{t\in\mathcal{T}} Y_t$. Since
+$\E[x_i^{t+1}|x^t]= f(\inprod{\theta^*}{x^t})$, we have
+$\E[Y_{t+1}\,|\,Y_1,\ldots,Y_t] = 0$. Hence $Z_t = \sum_{k=1}^t Y_k$ is a
+martingale.
+
+Using assumption (LF), we have almost surely $|Z_{t+1}-Z_t|\leq
+\frac{1}{\alpha}$ and we can apply Azuma's inequality to $Z_t$:
+\begin{displaymath}
+ \P\big[|Z_{\mathcal{T}}|\geq \lambda\big]\leq
+ 2\exp\left(\frac{-\lambda^2\alpha}{2n}\right)
+\end{displaymath}
+
+Since $\partial_j\mathcal{L}(\theta^*) = Z_{\mathcal{T}}/|\mathcal{T}|$ with
+$n = |\mathcal{T}|$, a union bound over the $m$ coordinates of
+$\nabla\mathcal{L}(\theta^*)$ gives:
+\begin{align*}
+ \P\big[\|\nabla\mathcal{L}(\theta^*)\|_{\infty}\geq \lambda \big]
+ &\leq 2m\exp\left(\frac{-\lambda^2n\alpha}{2}\right)
+\end{align*}
+Choosing $\lambda\defeq 2\sqrt{\frac{\log m}{\alpha n^{1-\delta}}}$ concludes
+the proof.
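+Indeed, with this choice of $\lambda$, the union bound above evaluates to
+\begin{displaymath}
+  2m\exp\left(-\frac{\lambda^2 n\alpha}{2}\right)
+  = 2m\exp\left(-2n^{\delta}\log m\right)
+  = 2m^{1-2n^{\delta}} \leq \frac{2}{m},
+\end{displaymath}
+since $n^{\delta}\geq 1$ for every $\delta\in(0,1)$ and $n\geq 1$.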
+\end{proof}
+
\subsubsection{Approximate sparsity proof}
\subsubsection{RE with high probability}
-\subsection{Other continuous time processes binned to ours: prop. hazards model}
+\subsection{Other continuous-time processes binned to ours: the proportional
+hazards model}
\subsection{Irrepresentability vs. Restricted Eigenvalue Condition}
In the words and notation of Theorem 9.1 in \cite{vandegeer:2009}:
diff --git a/paper/sections/model.tex b/paper/sections/model.tex
index 4241f93..43de785 100644
--- a/paper/sections/model.tex
+++ b/paper/sections/model.tex
@@ -127,6 +127,22 @@ Defining $\Theta_{i,j} \defeq \log(1-p_{i,j})$, this can be rewritten as:
Therefore, the independent cascade model is a Generalized Linear Cascade model
with inverse link function $f : z \mapsto 1 - e^z$.
+Note that interpreting our recovery guarantees in the case of the Independent
+Cascade Model requires one more step. Indeed, to cast it as a generalized
+linear cascade model, we had to perform the transformation
+$\Theta_{i,j} = \log(1-p_{i,j})$, where the $p_{i,j}$ are the infection
+probabilities. Fortunately, if we estimate $p_j$ through
+$\hat{p}_{j} = 1 - e^{\hat{\theta}_j}$ (coordinate-wise), a bound on
+$\|\hat\theta - \theta^*\|_2$ directly implies a bound on
+$\|\hat p - p^*\|_2$:
+
+\begin{lemma}
+ $\|\hat{\theta} - \theta^* \|_2 \geq \|\hat{p} - p^*\|_2$.
+\end{lemma}
+\begin{proof}
+Using the inequality $\forall x>0, \; \log x \geq 1 - \frac{1}{x}$, we have
+$|\log (1 - p) - \log (1-p')| \geq \max(1 - \frac{1-p}{1-p'},
+1 - \frac{1-p'}{1-p}) \geq \max( p-p', p'-p) = |p - p'|$. Applying this
+coordinate-wise and summing the squares yields the claim.
+\end{proof}
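+For a concrete sanity check, take $p = 0.5$ and $p' = 0.4$: then
+$|\log(1-p) - \log(1-p')| = |\log 0.5 - \log 0.6| \approx 0.18 \geq 0.1
+= |p - p'|$, so the error in $\theta$ indeed dominates the error in $p$.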
+
\subsubsection{The Linear Voter Model}
In the Linear Voter Model, nodes can be either \emph{red} or \emph{blue}.
@@ -150,29 +166,42 @@ with inverse link function $f: z \mapsto z$.
\subsubsection{Discretization of the Continuous Model}
-Consider the continuous-time independent cascade model with exponential
-transmission function (CICE) of~\cite{GomezRodriguez:2010, Abrahao:13,
-Daneshmand:2014} where time is binned into intervals of length $\epsilon$. Let
-$X^k$ be the set of nodes `infected' before or during the $k^{th}$ time
-interval. Note that contrary to the discrete-time independent cascade model,
-$X^k_i = 1 \implies X^{k+1}_i = 1$. Let $\exp(p)$ be an
-exponentially-distributed random variable of parameter $p$. By the memoryless
-property of the exponential, if we consider that $\forall i,\Theta_{i,j} = 0 if
-(i,j) \notin E$, then if $X^k_j \neq 1$:
+Another motivation for the Generalized Linear Cascade model is that it captures
+the time-discretized formulation of the well-studied continuous-time
+independent cascade model with exponential transmission function (CICE)
+of~\cite{GomezRodriguez:2010, Abrahao:13, Daneshmand:2014}. Suppose that time
+is binned into intervals of length $\epsilon$ and let $X^k$ be the set of nodes
+`infected' before or during the $k^{th}$ time interval. Note that contrary to
+the discrete-time independent cascade model, $X^k_i = 1 \implies X^{k+1}_i =
+1$. Let $\exp(p)$ be an exponentially-distributed random variable of parameter
+$p$. By the memoryless property of the exponential, if $X^k_j \neq 1$:
\begin{align*}
  \mathbb{P}(X^{k+1}_j = 1 | X^k) & = \mathbb{P}(\min_{i \in {\cal N}(j),\, X^k_i = 1}
  \exp(p_{i,j}) \leq \epsilon) \\
  & = \mathbb{P}(\exp( \sum_{i=1}^m \Theta_{i,j} X^k_i) \leq \epsilon) \\
  & = 1 - e^{- \epsilon \cdot \theta_j \cdot X^k}
\end{align*}
-This formulation is consistent when $X^k_j = 1$ by considering the dummy
-variables $\forall i, \Theta_{i,i} = +\infty$. Therefore, the
-$\epsilon-$binned-process of the continuous-time model with exponential
-transmission function is a Generalized Linear Cascade model with inverse link
-function $f:z\mapsto 1-e^{-\epsilon\cdot z}$.
+Here we implicitly let $\Theta_{i,j} = 0$ whenever $(i,j)$ is not a directed
+edge of the graph. The formulation is also consistent when $X^k_j = 1$ by
+introducing the dummy variables $\Theta_{i,i} = +\infty$ for all $i$.
+Therefore, the $\epsilon$-binned process of the continuous-time model with
+exponential transmission function is a Generalized Linear Cascade model with
+inverse link function $f:z\mapsto 1-e^{-\epsilon\cdot z}$.
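+For completeness, the second equality in the display above uses the fact that
+a minimum of independent exponentials is exponential with the sum of the
+rates: writing $E_i\sim\exp(p_{i,j})$ for the transmission times and
+$\Theta_{i,j} = p_{i,j}$ for $(i,j)\in E$,
+\begin{displaymath}
+  \P\Big(\min_{i :\, X^k_i = 1} E_i > \epsilon\Big)
+  = \prod_{i :\, X^k_i = 1} e^{-\epsilon p_{i,j}}
+  = e^{-\epsilon \sum_{i=1}^m \Theta_{i,j} X^k_i}.
+\end{displaymath}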
\subsubsection{Logistic Cascades}
+Consider the specific case of ``logistic cascades'', where $f$ is the logistic
+function. This model captures the idea that there is a threshold after which
+each additional neighbor's influence contribution becomes significant. In this
+way, logistic cascades can be thought of as approximating the more commonly
+studied Linear Threshold model (TODO: cite). As we will see later in the
+analysis, the Hessian of our optimization program simplifies to
+$\nabla^2\mathcal{L}(\theta^*) = \frac{1}{|\mathcal{T}|}XX^T$, where $X$ is the
+observation matrix $[x^1 \ldots x^{|\mathcal{T}|}]$. The (RE)-condition is then
+equivalent to the assumption made in the Lasso analysis of \cite{bickel:2009}.
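+(Recall that in the Lasso setting of \cite{bickel:2009}, the restricted
+eigenvalue condition is imposed on the Gram matrix $\frac{1}{n}XX^T$ of the
+design, which is exactly the matrix appearing above.)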
+
+
+
% \subsection{The Linear Threshold Model}
% In the Linear Threshold Model, each node $j\in V$ has a threshold $t_j$ from
diff --git a/paper/sections/results.tex b/paper/sections/results.tex
index 54fc587..33bff55 100644
--- a/paper/sections/results.tex
+++ b/paper/sections/results.tex
@@ -92,24 +92,6 @@ $n$, which is different from the number of cascades. For example, in the case
of the voter model with horizon time $T$ and for $N$ cascades, we can expect
a number of measurements proportional to $N\times T$.
-{\color{red} Move this to the model section}
-Before moving to the proof of Theorem~\ref{thm:main}, note that interpreting it
-in the case of the Independent Cascade Model requires one more step. Indeed, to
-cast it as a generalized linear cascade model, we had to perform the
-transformation $\Theta_{i,j} = \log(1-p_{i,j})$, where $p_{i,j}$ are the
-infection probabilities. Fortunately, if we estimate $p_j$ through $\hat{p}_{j}
-= \hat{\theta}_j$, a bound on $\|\hat\theta - \theta^*\|_2$ directly implies
-a bound on $\|\hat p - p^*\|_2$:
-
-\begin{lemma}
- $\|\hat{\theta} - \theta^* \|_2 \geq \|\hat{p} - p^*\|_2$.
-\end{lemma}
-\begin{proof}
-Using the inequality $\forall x>0, \; \log x \geq 1 - \frac{1}{x}$, we have
-$|\log (1 - p) - \log (1-p')| \geq \max(1 - \frac{1-p}{1-p'},
-1 - \frac{1-p'}{1-p}) \geq \max( p-p', p'-p)$.
-\end{proof}
-
Theorem~\ref{thm:main} is a consequence of Theorem~1 in \cite{Negahban:2009}
which gives a bound on the convergence rate of regularized estimators. We state
their theorem in the context of $\ell_1$ regularization in
@@ -155,38 +137,7 @@ Assume {\bf(LF)} holds for some $\alpha>0$. For any $\delta\in(0,1)$:
\end{displaymath}
\end{lemma}
-\begin{proof}
-The gradient of $\mathcal{L}$ is given by:
-\begin{multline*}
- \nabla \mathcal{L}(\theta^*) =
- \frac{1}{|\mathcal{T}|}\sum_{t\in \mathcal{T}}x^t\bigg[
- x_i^{t+1}\frac{f'}{f}(\inprod{\theta^*}{x^t})\\
- - (1-x_i^{t+1})\frac{f'}{1-f}(\inprod{\theta^*}{x^t})\bigg]
-\end{multline*}
-Let $\partial_j \mathcal{L}(\theta)$ be the $j$-th coordinate of
-$\nabla\mathcal{L}(\theta^*)$. Writing
-$\partial_j\mathcal{L}(\theta^*)
-= \frac{1}{|\mathcal{T}|}\sum_{t\in\mathcal{T}} Y_t$ and since
-$\E[x_i^{t+1}|x^t]= f(\inprod{\theta^*}{x^t})$, we have that $\E[Y_{t+1}|Y_t]
-= 0$. Hence $Z_t = \sum_{k=1}^t Y_k$ is a martingale.
-
-Using assumption (LF), we have almost surely $|Z_{t+1}-Z_t|\leq \frac{1}{\alpha}$
-and we can apply Azuma's inequality to $Z_t$:
-\begin{displaymath}
- \P\big[|Z_{\mathcal{T}}|\geq \lambda\big]\leq
- 2\exp\left(\frac{-\lambda^2\alpha}{2n}\right)
-\end{displaymath}
-
-Applying a union bound to have the previous inequality hold for all coordinates
-of $\nabla\mathcal{L}(\theta)$ implies:
-\begin{align*}
- \P\big[\|\nabla\mathcal{L}(\theta^*)\|_{\infty}\geq \lambda \big]
- &\leq 2m\exp\left(\frac{-\lambda^2n\alpha}{2}\right)
-\end{align*}
-Choosing $\lambda\defeq 2\sqrt{\frac{\log m}{\alpha n^{1-\delta}}}$ concludes the
-proof.
-\end{proof}
Note how the proof of Lemma~\ref{lem:ub} relies crucially on the Azuma-Hoeffding
inequality, which allows us to handle correlated observations. We now show how
@@ -203,17 +154,17 @@ i \in \{1,\ldots,m\} :\theta_i^* > \eta +\epsilon \}$ be the set of all true
\frac{9s\log m}{\alpha\gamma^2\epsilon^2}$. Then with probability
$1-\frac{1}{m}$, ${\cal S}^*_{\eta + \epsilon} \subseteq \hat {\cal S}_\eta
\subseteq {\cal S}^*$. In other words we recover all `strong' parents and no
-`false' parents.
+`false' parents.
\end{corollary}
\begin{proof}
-By choosing $\delta = 0$, if $ n > \frac{9s\log
-m}{\alpha\gamma^2\epsilon^2}$, then $\|\hat \theta-\theta^*\|_2 < \epsilon < \eta$
-with probability $1-\frac{1}{m}$. If $\theta_i^* = 0$ and $\hat \theta > \eta$,
-then $\|\hat \theta - \theta^*\|_2 \geq |\hat \theta_i-\theta_i^*| > \eta$, which
-is a contradiction. Therefore we get no false positives. If $\theta_i^* > \eta
-+ \epsilon$, then $|\hat{\theta}_i- \theta_i^*| < \epsilon \implies \theta_j
-> \eta$ and we get all strong parents.
+By choosing $\delta = 0$, if $ n > \frac{9s\log m}{\alpha\gamma^2\epsilon^2}$,
+then $\|\hat \theta-\theta^*\|_2 < \epsilon < \eta$ with probability
+$1-\frac{1}{m}$. If $\theta_i^* = 0$ and $\hat \theta_i > \eta$, then $\|\hat
+\theta - \theta^*\|_2 \geq |\hat \theta_i-\theta_i^*| > \eta$, which is a
+contradiction. Therefore we get no false positives. If $\theta_i^* > \eta +
+\epsilon$, then $|\hat{\theta}_i- \theta_i^*| < \epsilon \implies \hat\theta_i >
+\eta$ and we get all strong parents.
\end{proof}
Assuming we know a lower bound $\alpha$ on $\Theta_{i,j}$,
@@ -318,13 +269,6 @@ too collinear with each other.
{\color{red} if the function is strictly log-convex, then equivalent -> explain
what the gram matrix is (explanation)}
-{\color{red} move to model section, small example}
-In the specific case of ``logistic cascades'' (when $f$ is the logistic
-function), the Hessian simplifies to $\nabla^2\mathcal{L}(\theta^*)
-= \frac{1}{|\mathcal{T}|}XX^T$ where $X$ is the observation matrix $[x^1 \ldots
-x^\mathcal{|T|}]$. The (RE)-condition is then equivalent
-to the assumption made in the Lasso analysis of \cite{bickel:2009}.
-
\paragraph{(RE) with high probability}
The Generalized Linear Cascade model yields a probability distribution over the
observed sets of infected nodes $(x^t)_{t\in\mathcal{T}}$. It is then natural
to ask whether the restricted eigenvalue condition is likely to occur under
this probabilistic model. Several recent papers show that large classes of
correlated designs obey the restricted eigenvalue property with high
-probability \cite{raskutti:10, rudelson:13}.
+probability \cite{raskutti:10, rudelson:13}.
In our case, we can show that if the (RE)-condition holds for the expected Hessian
matrix $\E[\nabla^2\mathcal{L}(\theta^*)]$, then it holds for the finite sample