In this appendix, we provide the missing proofs of Section~\ref{sec:results} and Section~\ref{sec:lowerbound}. We also report additional experiments on the running time of our recovery algorithm which could not fit in the main body of the paper.

\subsection{Proofs of Section~\ref{sec:results}}

\begin{proof}[Proof of Lemma~\ref{lem:transform}]
Using the inequality $\log x \geq 1 - \frac{1}{x}$, valid for all $x>0$, we have $|\log (\frac{1}{1 - p}) - \log (\frac{1}{1-p'})| = \max\big(\log\frac{1-p'}{1-p}, \log\frac{1-p}{1-p'}\big) \geq \max\big(1 - \frac{1-p}{1-p'}, 1 - \frac{1-p'}{1-p}\big) \geq \max( p-p', p'-p) = |p-p'|$.
\end{proof}

\begin{proof}[Proof of Lemma~\ref{lem:ub}]
The gradient of $\mathcal{L}$ is given by:
\begin{multline*}
\nabla \mathcal{L}(\theta^*) = \frac{1}{|\mathcal{T}|}\sum_{t\in \mathcal{T}}x^t\bigg[ x_i^{t+1}\frac{f'}{f}(\inprod{\theta^*}{x^t})\\
- (1-x_i^{t+1})\frac{f'}{1-f}(\inprod{\theta^*}{x^t})\bigg]
\end{multline*}
Let $\partial_j \mathcal{L}(\theta^*)$ denote the $j$-th coordinate of $\nabla\mathcal{L}(\theta^*)$ and write $\partial_j\mathcal{L}(\theta^*) = \frac{1}{|\mathcal{T}|}\sum_{t\in\mathcal{T}} Y_t$. Since $\E[x_i^{t+1}|x^t]= f(\inprod{\theta^*}{x^t})$, we have $\E[Y_t\,|\,x^1,\dots,x^t] = 0$. Hence $Z_t \defeq \sum_{k=1}^t Y_k$ is a martingale. Using assumption (LF), we have almost surely $|Z_{t+1}-Z_t|\leq \frac{1}{\alpha}$ and we can apply Azuma's inequality to $Z_t$:
\begin{displaymath}
\P\big[|Z_{|\mathcal{T}|}|\geq \lambda\big]\leq 2\exp\left(\frac{-\lambda^2\alpha}{2n}\right)
\end{displaymath}
Since $\partial_j\mathcal{L}(\theta^*) = Z_{|\mathcal{T}|}/|\mathcal{T}|$ and $n = |\mathcal{T}|$, applying the previous inequality with $\lambda$ replaced by $n\lambda$ and taking a union bound over the $m$ coordinates of $\nabla\mathcal{L}(\theta^*)$ implies:
\begin{align*}
\P\big[\|\nabla\mathcal{L}(\theta^*)\|_{\infty}\geq \lambda \big] &\leq 2m\exp\left(\frac{-\lambda^2n\alpha}{2}\right)
\end{align*}
Choosing $\lambda\defeq 2\sqrt{\frac{\log m}{\alpha n^{1-\delta}}}$ concludes the proof.
\end{proof}

\begin{proof}[Proof of Corollary~\ref{cor:variable_selection}]
By choosing $\delta = 0$, if $ n > \frac{9s\log m}{\alpha\gamma^2\epsilon^2}$, then $\|\hat \theta-\theta^*\|_2 < \epsilon < \eta$ with probability at least $1-\frac{1}{m}$. If $\theta_i^* = 0$ and $\hat \theta_i > \eta$, then $\|\hat \theta - \theta^*\|_2 \geq |\hat \theta_i-\theta_i^*| > \eta$, which is a contradiction. Therefore we get no false positives. If $\theta_i^* > \eta + \epsilon$, then $|\hat{\theta}_i- \theta_i^*| < \epsilon$ implies $\hat{\theta}_i > \eta$, so we recover all strong parents.
\end{proof}

\paragraph{(RE) with high probability}
We now prove Proposition~\ref{prop:fi}. The proof mostly relies on showing that the Hessian of the likelihood function $\mathcal{L}$ is sufficiently well concentrated around its expectation.

\begin{proof}
Writing $H\defeq \nabla^2\mathcal{L}(\theta^*)$, if $\|\E[H] - H\|_\infty\leq \lambda$ and $\E[H]$ verifies the $(S,\gamma)$-(RE) condition, then:
\begin{equation}
\label{eq:foo}
\forall \Delta\in C(S),\; \Delta H\Delta \geq \Delta \E[H]\Delta(1-32s\lambda/\gamma)
\end{equation}
Indeed, $ |\Delta(H-\E[H])\Delta| \leq 2\lambda \|\Delta\|_1^2\leq 2\lambda(4\sqrt{s}\|\Delta_S\|_2)^2 $. Writing $\partial^2_{j,k}\mathcal{L}(\theta^*)=\frac{1}{|\mathcal{T}|}\sum_{t\in \mathcal{T}}Y_t$ and using (LF) and (LF2), we have $\big|Y_t - \E[Y_t]\big|\leq \frac{3}{\alpha}$.
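For concreteness, and assuming $f$ is twice differentiable, differentiating the expression of $\nabla\mathcal{L}(\theta^*)$ given above shows that the generic term $Y_t$ in the expansion of $\partial^2_{j,k}\mathcal{L}(\theta^*)$ takes the form (writing $u_t \defeq \inprod{\theta^*}{x^t}$):
\begin{displaymath}
Y_t = x_j^t x_k^t\bigg[x_i^{t+1}\,\frac{f''f - (f')^2}{f^2}(u_t) - (1 - x_i^{t+1})\,\frac{f''(1-f) + (f')^2}{(1-f)^2}(u_t)\bigg],
\end{displaymath}
which is the quantity whose fluctuations are bounded above through (LF) and (LF2).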
Applying Azuma's inequality as in the proof of Lemma~\ref{lem:ub}, this implies:
\begin{displaymath}
\P\big[\|\E[H]-H\|_{\infty}\geq\lambda\big] \leq 2\exp\left(-\frac{n\alpha\lambda^2}{3} + 2\log m\right)
\end{displaymath}
Thus, if we take $\lambda=\sqrt{\frac{9\log m}{\alpha n^{1-\delta}}}$, then $\|\E[H]-H\|_{\infty}\leq\lambda$ with probability at least $1-e^{-n^{\delta}\log m}$. When $n^{1-\delta}\geq \frac{1}{28\gamma\alpha}s^2\log m$, \eqref{eq:foo} implies $ \forall \Delta\in C(S),\; \Delta H\Delta \geq \frac{1}{2} \Delta \E[H]\Delta $ with probability at least $1-e^{-n^{\delta}\log m}$, and the conclusion of Proposition~\ref{prop:fi} follows.
\end{proof}

\subsection{Proof of Theorem~\ref{thm:lb}}

Let us consider an algorithm $\mathcal{A}$ which satisfies the recovery guarantee of Theorem~\ref{thm:lb}: there exists a probability distribution over measurements such that for all vectors $\theta^*$, \eqref{eq:lb} holds w.p.~$\delta$. By the probabilistic method, this implies that for every distribution $D$ over vectors $\theta$, there exists an $n\times m$ measurement matrix $X_D$ such that \eqref{eq:lb} holds w.p.~$\delta$ ($\theta$ is now the random variable). Consider the following distribution $D$: choose $S$ uniformly at random from a ``well-chosen'' set of $s$-sparse supports $\mathcal{F}$, and $t$ uniformly at random from $X \defeq\big\{t\in\{-1,0,1\}^m\,|\, \mathrm{supp}(t)\in\mathcal{F}\big\}$. Define $\theta = t + w$, where $w\sim\mathcal{N}(0, \alpha\frac{s}{m}I_m)$ and $\alpha = \Omega(\frac{1}{C})$. Consider the following communication game between Alice and Bob: \emph{(1)} Alice draws $y\in\{0,1\}^n$ from a Bernoulli distribution of parameter $f(X_D\theta)$ and sends it to Bob. \emph{(2)} Bob uses $\mathcal{A}$ to recover $\hat{\theta}$ from $y$. It can be shown that at the end of the game Bob holds an amount of information $\Omega(s\log \frac{m}{s})$ about $S$. By the Shannon-Hartley theorem, this information is also upper-bounded by $\O(n\log C)$. These two bounds together imply the theorem.

\subsection{Running Time Analysis}

\begin{figure}
\centering
\includegraphics[scale=.4]{figures/n_nodes_running_time.pdf}
\vspace{-1em}
\caption{Running time analysis for estimating the parents of a \emph{single node} on a Barabasi-Albert graph as a function of the number of nodes in the graph. The parameter $k$ (the number of nodes each new node is attached to) is set to $30$, $p_{\text{init}}$ is chosen equal to $.15$, and the edge weights are chosen uniformly at random in $[.2,.7]$. The penalization parameter $\lambda$ is chosen equal to $.1$.}
\label{fig:running_time_n_nodes}
\vspace{-1em}
\end{figure}

\begin{figure}
\centering
\includegraphics[scale=.4]{figures/n_cascades_running_time.pdf}
\caption{Running time analysis for estimating the parents of a \emph{single node} on a Barabasi-Albert graph as a function of the total number of observed cascades. The parameters defining the graph are set as in Figure~\ref{fig:running_time_n_nodes}.}
\label{fig:running_time_n_cascades}
\vspace{-1em}
\end{figure}

We include here a running time analysis of our algorithm. In Figure~\ref{fig:running_time_n_nodes}, we compare our algorithm to the benchmark algorithms for increasing numbers of nodes. In Figure~\ref{fig:running_time_n_cascades}, we compare our algorithm to the benchmarks on a fixed graph for an increasing number of observed cascades. In both figures, unsurprisingly, the simple greedy algorithm is the fastest.
Even though both the MLE algorithm and the algorithm we introduced are based on convex optimization, the MLE algorithm is faster. This is due to the overhead caused by the $\ell_1$-regularization in~\eqref{eq:pre-mle}. As expected, the running time grows linearly with the number of observed cascades. The slope is largest for our algorithm, again because of the overhead induced by the $\ell_1$-regularization.
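\paragraph{Reproducing the timing setup}
For reference, the experimental setup of Figures~\ref{fig:running_time_n_nodes} and~\ref{fig:running_time_n_cascades} can be reproduced along the following lines. This is a minimal sketch only: it assumes \texttt{networkx} and \texttt{scikit-learn}, uses an independent-cascade-style transition as a simplified stand-in for the generative model of the main text, and times an off-the-shelf $\ell_1$-penalized logistic regression instead of the exact estimator~\eqref{eq:pre-mle}.
\begin{verbatim}
import time
import networkx as nx
import numpy as np
from sklearn.linear_model import LogisticRegression

# Illustrative parameters, following the captions of the running-time figures.
m, k = 300, 30            # number of nodes, attachment parameter of the BA graph
p_init, lam = 0.15, 0.1   # initial activation probability, l1 penalty

rng = np.random.default_rng(0)
G = nx.barabasi_albert_graph(m, k, seed=0)
nx.set_edge_attributes(
    G, {e: rng.uniform(0.2, 0.7) for e in G.edges()}, "weight")

def one_step(state):
    """One synchronous transition: node i activates with probability
    1 - prod_{j active, j ~ i} (1 - w_{ij}) (simplified stand-in dynamic)."""
    nxt = np.zeros(m, dtype=int)
    for i in range(m):
        q = 1.0
        for j in G.neighbors(i):
            if state[j]:
                q *= 1.0 - G[i][j]["weight"]
        nxt[i] = int(rng.random() < 1.0 - q)
    return nxt

# Collect (x^t, x_target^{t+1}) pairs over several short cascades.
target, X, y = 0, [], []
for _ in range(100):                       # number of observed cascades
    state = (rng.random(m) < p_init).astype(int)
    for _ in range(5):                     # cascade horizon
        nxt = one_step(state)
        X.append(state)
        y.append(nxt[target])
        state = nxt

# Time the l1-penalized fit for a single node (sklearn's C is 1/penalty).
clf = LogisticRegression(penalty="l1", C=1.0 / lam, solver="liblinear")
start = time.time()
clf.fit(np.array(X), np.array(y))
print("recovery for one node took %.2f seconds" % (time.time() - start))
\end{verbatim}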