In this appendix, we provide the missing proofs of Section~\ref{sec:results} and Section~\ref{sec:lowerbound}. We also report additional experiments on the running time of our recovery algorithm which could not fit in the main body of the paper.

\subsection{Proofs of Section~\ref{sec:results}}

\begin{proof}[Proof of Lemma~\ref{lem:transform}]
Using the inequality $\log x \geq 1 - \frac{1}{x}$, valid for all $x>0$, we have $|\log (\frac{1}{1 - p}) - \log (\frac{1}{1-p'})| = \max\big(\log\frac{1-p'}{1-p}, \log\frac{1-p}{1-p'}\big) \geq \max\big(1 - \frac{1-p}{1-p'}, 1 - \frac{1-p'}{1-p}\big) \geq \max( p-p', p'-p) = |p-p'|$.
\end{proof}

\begin{proof}[Proof of Lemma~\ref{lem:ub}]
The gradient of $\mathcal{L}$ is given by:
\begin{multline*}
\nabla \mathcal{L}(\theta^*) = \frac{1}{|\mathcal{T}|}\sum_{t\in \mathcal{T}}x^t\bigg[ x_i^{t+1}\frac{f'}{f}(\inprod{\theta^*}{x^t})\\
- (1-x_i^{t+1})\frac{f'}{1-f}(\inprod{\theta^*}{x^t})\bigg]
\end{multline*}
Let $\partial_j \mathcal{L}(\theta^*)$ denote the $j$-th coordinate of $\nabla\mathcal{L}(\theta^*)$ and write $\partial_j\mathcal{L}(\theta^*) = \frac{1}{|\mathcal{T}|}\sum_{t\in\mathcal{T}} Y_t$. Since $\E[x_i^{t+1}|x^t]= f(\inprod{\theta^*}{x^t})$, we have $\E[Y_t\,|\,x^1,\dots,x^t] = 0$. Hence $Z_t \defeq \sum_{k=1}^t Y_k$ is a martingale. Using assumption (LF), we have almost surely $|Z_{t+1}-Z_t|\leq \frac{1}{\alpha}$ and we can apply Azuma's inequality to $Z_t$:
\begin{displaymath}
\P\big[|Z_{|\mathcal{T}|}|\geq \lambda\big]\leq 2\exp\left(\frac{-\lambda^2\alpha}{2n}\right)
\end{displaymath}
Since $\partial_j\mathcal{L}(\theta^*) = Z_{|\mathcal{T}|}/|\mathcal{T}|$ and $n = |\mathcal{T}|$, applying the previous inequality with $\lambda$ replaced by $n\lambda$ and taking a union bound over the $m$ coordinates of $\nabla\mathcal{L}(\theta^*)$ implies:
\begin{align*}
\P\big[\|\nabla\mathcal{L}(\theta^*)\|_{\infty}\geq \lambda \big] &\leq 2m\exp\left(\frac{-\lambda^2n\alpha}{2}\right)
\end{align*}
Choosing $\lambda\defeq 2\sqrt{\frac{\log m}{\alpha n^{1-\delta}}}$ concludes the proof.
\end{proof}

\begin{proof}[Proof of Corollary~\ref{cor:variable_selection}]
By choosing $\delta = 0$, if $ n > \frac{9s\log m}{\alpha\gamma^2\epsilon^2}$, then $\|\hat \theta-\theta^*\|_2 < \epsilon < \eta$ with probability at least $1-\frac{1}{m}$. If $\theta_i^* = 0$ and $\hat \theta_i > \eta$, then $\|\hat \theta - \theta^*\|_2 \geq |\hat \theta_i-\theta_i^*| > \eta$, which is a contradiction. Therefore we get no false positives. If $\theta_i^* > \eta + \epsilon$, then $|\hat{\theta}_i- \theta_i^*| < \epsilon$ implies $\hat{\theta}_i > \eta$, so we recover all strong parents.
\end{proof}

\paragraph{(RE) with high probability}
We now prove Proposition~\ref{prop:fi}. The proof mostly relies on showing that the Hessian of the likelihood function $\mathcal{L}$ is sufficiently well concentrated around its expectation.

\begin{proof}
Writing $H\defeq \nabla^2\mathcal{L}(\theta^*)$, if $\|\E[H] - H\|_\infty\leq \lambda$ and $\E[H]$ verifies the $(S,\gamma)$-(RE) condition, then:
\begin{equation}
\label{eq:foo}
\forall \Delta\in C(S),\; \Delta H\Delta \geq \Delta \E[H]\Delta(1-32s\lambda/\gamma)
\end{equation}
Indeed, $ |\Delta(H-\E[H])\Delta| \leq 2\lambda \|\Delta\|_1^2\leq 2\lambda(4\sqrt{s}\|\Delta_S\|_2)^2 $. Writing $\partial^2_{j,k}\mathcal{L}(\theta^*)=\frac{1}{|\mathcal{T}|}\sum_{t\in \mathcal{T}}Y_t$ and using (LF) and (LF2), we have $\big|Y_t - \E[Y_t]\big|\leq \frac{3}{\alpha}$.
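For concreteness, and assuming $f$ is twice differentiable, differentiating the expression of $\nabla\mathcal{L}(\theta^*)$ given above shows that the generic term $Y_t$ in the expansion of $\partial^2_{j,k}\mathcal{L}(\theta^*)$ takes the form (writing $u_t \defeq \inprod{\theta^*}{x^t}$):
\begin{displaymath}
Y_t = x_j^t x_k^t\bigg[x_i^{t+1}\,\frac{f''f - (f')^2}{f^2}(u_t) - (1 - x_i^{t+1})\,\frac{f''(1-f) + (f')^2}{(1-f)^2}(u_t)\bigg],
\end{displaymath}
which is the quantity whose fluctuations are bounded above through (LF) and (LF2).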
Applying Azuma's inequality as in the proof of Lemma~\ref{lem:ub}, this implies:
\begin{displaymath}
\P\big[\|\E[H]-H\|_{\infty}\geq\lambda\big] \leq 2\exp\left(-\frac{n\alpha\lambda^2}{3} + 2\log m\right)
\end{displaymath}
Thus, if we take $\lambda=\sqrt{\frac{9\log m}{\alpha n^{1-\delta}}}$, then $\|\E[H]-H\|_{\infty}\leq\lambda$ with probability at least $1-e^{-n^{\delta}\log m}$. When $n^{1-\delta}\geq \frac{1}{28\gamma\alpha}s^2\log m$, \eqref{eq:foo} implies $ \forall \Delta\in C(S),\; \Delta H\Delta \geq \frac{1}{2} \Delta \E[H]\Delta $ with probability at least $1-e^{-n^{\delta}\log m}$, and the conclusion of Proposition~\ref{prop:fi} follows.
\end{proof}

\subsection{Proof of Theorem~\ref{thm:lb}}

Let us consider an algorithm $\mathcal{A}$ which satisfies the recovery guarantee of Theorem~\ref{thm:lb}: there exists a probability distribution over measurements such that for all vectors $\theta^*$, \eqref{eq:lb} holds w.p.~$\delta$. By the probabilistic method, this implies that for every distribution $D$ over vectors $\theta$, there exists an $n\times m$ measurement matrix $X_D$ such that \eqref{eq:lb} holds w.p.~$\delta$ ($\theta$ is now the random variable). Consider the following distribution $D$: choose $S$ uniformly at random from a ``well-chosen'' set of $s$-sparse supports $\mathcal{F}$, and $t$ uniformly at random from $X \defeq\big\{t\in\{-1,0,1\}^m\,|\, \mathrm{supp}(t)\in\mathcal{F}\big\}$. Define $\theta = t + w$, where $w\sim\mathcal{N}(0, \alpha\frac{s}{m}I_m)$ and $\alpha = \Omega(\frac{1}{C})$. Consider the following communication game between Alice and Bob: \emph{(1)} Alice draws $y\in\{0,1\}^n$ from a Bernoulli distribution of parameter $f(X_D\theta)$ and sends it to Bob. \emph{(2)} Bob uses $\mathcal{A}$ to recover $\hat{\theta}$ from $y$. It can be shown that at the end of the game Bob holds an amount of information $\Omega(s\log \frac{m}{s})$ about $S$. By the Shannon-Hartley theorem, this information is also upper-bounded by $\O(n\log C)$. These two bounds together imply the theorem.

\subsection{Running Time Analysis}

\begin{figure}
\centering
\includegraphics[scale=.4]{figures/n_nodes_running_time.pdf}
\vspace{-1em}
\caption{Running time analysis for estimating the parents of a \emph{single node} on a Barabasi-Albert graph as a function of the number of nodes in the graph. The parameter $k$ (the number of nodes each new node is attached to) is set to $30$, $p_{\text{init}}$ is chosen equal to $.15$, and the edge weights are chosen uniformly at random in $[.2,.7]$. The penalization parameter $\lambda$ is chosen equal to $.1$.}
\label{fig:running_time_n_nodes}
\vspace{-1em}
\end{figure}

\begin{figure}
\centering
\includegraphics[scale=.4]{figures/n_cascades_running_time.pdf}
\caption{Running time analysis for estimating the parents of a \emph{single node} on a Barabasi-Albert graph as a function of the total number of observed cascades. The parameters defining the graph are set as in Figure~\ref{fig:running_time_n_nodes}.}
\label{fig:running_time_n_cascades}
\vspace{-1em}
\end{figure}

We include here a running time analysis of our algorithm. In Figure~\ref{fig:running_time_n_nodes}, we compare our algorithm to the benchmark algorithms for increasing numbers of nodes. In Figure~\ref{fig:running_time_n_cascades}, we compare our algorithm to the benchmarks on a fixed graph for an increasing number of observed cascades. In both figures, unsurprisingly, the simple greedy algorithm is the fastest.
Even though both the MLE algorithm and the algorithm we introduced are based on convex optimization, the MLE algorithm is faster. This is due to the overhead caused by the $\ell_1$-regularization in~\eqref{eq:pre-mle}. As expected, the running time grows linearly with the number of observed cascades. The slope is largest for our algorithm, again because of the overhead induced by the $\ell_1$-regularization.
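\paragraph{Reproducing the timing setup}
For reference, the experimental setup of Figures~\ref{fig:running_time_n_nodes} and~\ref{fig:running_time_n_cascades} can be reproduced along the following lines. This is a minimal sketch only: it assumes \texttt{networkx} and \texttt{scikit-learn}, uses an independent-cascade-style transition as a simplified stand-in for the generative model of the main text, and times an off-the-shelf $\ell_1$-penalized logistic regression instead of the exact estimator~\eqref{eq:pre-mle}.
\begin{verbatim}
import time
import networkx as nx
import numpy as np
from sklearn.linear_model import LogisticRegression

# Illustrative parameters, following the captions of the running-time figures.
m, k = 300, 30            # number of nodes, attachment parameter of the BA graph
p_init, lam = 0.15, 0.1   # initial activation probability, l1 penalty

rng = np.random.default_rng(0)
G = nx.barabasi_albert_graph(m, k, seed=0)
nx.set_edge_attributes(
    G, {e: rng.uniform(0.2, 0.7) for e in G.edges()}, "weight")

def one_step(state):
    """One synchronous transition: node i activates with probability
    1 - prod_{j active, j ~ i} (1 - w_{ij}) (simplified stand-in dynamic)."""
    nxt = np.zeros(m, dtype=int)
    for i in range(m):
        q = 1.0
        for j in G.neighbors(i):
            if state[j]:
                q *= 1.0 - G[i][j]["weight"]
        nxt[i] = int(rng.random() < 1.0 - q)
    return nxt

# Collect (x^t, x_target^{t+1}) pairs over several short cascades.
target, X, y = 0, [], []
for _ in range(100):                       # number of observed cascades
    state = (rng.random(m) < p_init).astype(int)
    for _ in range(5):                     # cascade horizon
        nxt = one_step(state)
        X.append(state)
        y.append(nxt[target])
        state = nxt

# Time the l1-penalized fit for a single node (sklearn's C is 1/penalty).
clf = LogisticRegression(penalty="l1", C=1.0 / lam, solver="liblinear")
start = time.time()
clf.fit(np.array(X), np.array(y))
print("recovery for one node took %.2f seconds" % (time.time() - start))
\end{verbatim}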