\subsection{Proofs of the lemmas}

\subsubsection{Bounded gradient}

\begin{proof}
The gradient of $\mathcal{L}$ is given by:
\begin{multline*}
  \nabla \mathcal{L}(\theta^*) = \frac{1}{|\mathcal{T}|}\sum_{t\in \mathcal{T}}x^t\bigg[ x_i^{t+1}\frac{f'}{f}(\inprod{\theta^*}{x^t})\\
  - (1-x_i^{t+1})\frac{f'}{1-f}(\inprod{\theta^*}{x^t})\bigg]
\end{multline*}
Let $\partial_j \mathcal{L}(\theta^*)$ denote the $j$-th coordinate of $\nabla\mathcal{L}(\theta^*)$.
Writing $\partial_j\mathcal{L}(\theta^*) = \frac{1}{|\mathcal{T}|}\sum_{t\in\mathcal{T}} Y_t$ and since $\E[x_i^{t+1}\,|\,x^t]= f(\inprod{\theta^*}{x^t})$, we have $\E[Y_{t+1}\,|\,Y_1,\dots,Y_t] = 0$.
Hence $Z_t = \sum_{k=1}^t Y_k$ is a martingale.
Using assumption (LF), we have almost surely $|Z_{t+1}-Z_t|\leq \frac{1}{\alpha}$, so we can apply Azuma's inequality to $Z_t$:
\begin{displaymath}
  \P\big[|Z_{|\mathcal{T}|}|\geq \lambda\big]\leq 2\exp\left(\frac{-\lambda^2\alpha}{2n}\right)
\end{displaymath}
Since $\partial_j\mathcal{L}(\theta^*) = Z_{|\mathcal{T}|}/|\mathcal{T}|$ with $n=|\mathcal{T}|$, applying the previous inequality with $\lambda$ replaced by $n\lambda$ and taking a union bound over the $m$ coordinates of $\nabla\mathcal{L}(\theta^*)$ yields:
\begin{align*}
  \P\big[\|\nabla\mathcal{L}(\theta^*)\|_{\infty}\geq \lambda \big]
  &\leq 2m\exp\left(\frac{-\lambda^2n\alpha}{2}\right)
\end{align*}
Choosing $\lambda\defeq 2\sqrt{\frac{\log m}{\alpha n^{1-\delta}}}$ concludes the proof.
\end{proof}

%\subsubsection{Approximate sparsity proof}

\subsubsection{RE with high probability}

\begin{proof}
Write $H\defeq \nabla^2\mathcal{L}(\theta^*)$. If $\|\E[H] - H\|_\infty\leq \lambda$ and $\E[H]$ verifies the $(S,\gamma)$-(RE) condition, then:
\begin{equation}
  \label{eq:foo}
  \forall \Delta\in C(S),\; \Delta H\Delta \geq \Delta \E[H]\Delta\,(1-32s\lambda/\gamma)
\end{equation}
Indeed, for $\Delta\in C(S)$ we have $\|\Delta\|_1\leq 4\|\Delta_S\|_1\leq 4\sqrt{s}\|\Delta_S\|_2$, so that $|\Delta(H-\E[H])\Delta| \leq 2\lambda \|\Delta\|_1^2\leq 2\lambda(4\sqrt{s}\|\Delta_S\|_2)^2$.
Writing $\partial^2_{i,j}\mathcal{L}(\theta^*)=\frac{1}{|\mathcal{T}|}\sum_{t\in \mathcal{T}}Y_t$ and using (LF) and (LF2), we have $\big|Y_t - \E[Y_t]\big|\leq \frac{4(M+2)}{\alpha}$.
Applying Azuma's inequality as in the proof of Lemma~\ref{lem:ub}, this implies:
\begin{displaymath}
  \P\big[\|\E[H]-H\|_{\infty}\geq\lambda\big]
  \leq 2\exp\left(-\frac{n\alpha\lambda^2}{4(M+2)} + 2\log m\right)
\end{displaymath}
Thus, if we take $\lambda=\sqrt{\frac{12(M+2)\log m}{\alpha n^{1-\delta}}}$, then $\|\E[H]-H\|_{\infty}\leq\lambda$ w.p.~at least $1-e^{-n^{\delta}\log m}$.
When $n^{1-\delta}\geq \frac{M+2}{21\gamma\alpha}s^2\log m$, \eqref{eq:foo} implies
$\forall \Delta\in C(S),\; \Delta H\Delta \geq \frac{1}{2} \Delta \E[H]\Delta$,
w.p.~at least $1-e^{-n^{\delta}\log m}$, and the conclusion of Proposition~\ref{prop:fi} follows.
\end{proof}

\subsection{Lower Bound}

Let us consider an algorithm $\mathcal{A}$ which verifies the recovery guarantee of Theorem~\ref{thm:lb}: there exists a probability distribution over measurements such that for all vectors $\theta^*$, \eqref{eq:lb} holds w.p.~$\delta$.
By the probabilistic method, this implies that for every distribution $D$ over vectors $\theta$, there exists an $n\times m$ measurement matrix $X_D$ such that \eqref{eq:lb} holds w.p.~$\delta$ ($\theta$ is now the random variable).
Consider the following distribution $D$: choose $S$ uniformly at random from a ``well-chosen'' set of $s$-sparse supports $\mathcal{F}$, and choose $t$ uniformly at random from $X \defeq\big\{t\in\{-1,0,1\}^m\,|\, \mathrm{supp}(t)\in\mathcal{F}\big\}$.
Define $\theta = t + w$, where $w\sim\mathcal{N}(0, \alpha\frac{s}{m}I_m)$ and $\alpha = \Omega(\frac{1}{C})$.
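For concreteness, one standard choice for the ``well-chosen'' family (the precise construction is not essential to the argument) is to take $\mathcal{F}$ to be a maximal family of $s$-sparse supports whose pairwise intersections have size at most $s/2$; a counting argument then gives
\begin{displaymath}
  \log_2|\mathcal{F}| = \Omega\left(s\log\frac{m}{s}\right),
\end{displaymath}
so that the support $S$ alone carries $\Omega\big(s\log\frac{m}{s}\big)$ bits of entropy; this is the quantity of information tracked in the communication game below.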
Consider the following communication game between Alice and Bob: \emph{(1)} Alice sends to Bob a vector $y\in\{0,1\}^n$ whose coordinates are drawn from Bernoulli distributions with parameters $f(X_D\theta)$. \emph{(2)} Bob uses $\mathcal{A}$ to recover $\hat{\theta}$ from $y$.
It can be shown that at the end of the game Bob has acquired $\Omega(s\log \frac{m}{s})$ bits of information about $S$.
By the Shannon-Hartley theorem, this quantity of information is also upper-bounded by $\O(n\log C)$.
These two bounds together imply the theorem.

\subsection{Running Time Analysis}

\begin{figure}
  \centering
  \includegraphics[scale=.4]{figures/n_nodes_running_time.pdf}
  \caption{Running time for estimating the parents of a \emph{single node} on a Barabasi-Albert graph, as a function of the number of nodes in the graph. The parameter $k$ (number of nodes each new node is attached to) is held constant at $30$. $p_{\text{init}}$ is set to $.15$, and the edge weights are chosen uniformly at random in $[.2,.7]$. The penalization parameter $\lambda$ is set to $.1$.}
  \label{fig:running_time_n_nodes}
\end{figure}

\begin{figure}
  \centering
  \includegraphics[scale=.4]{figures/n_cascades_running_time.pdf}
  \caption{Running time for estimating the parents of a \emph{single node} on a Barabasi-Albert graph, as a function of the total number of observed cascades. The parameter $k$ (number of nodes each new node is attached to) is held constant at $30$. $p_{\text{init}}$ is set to $.15$, and the edge weights are chosen uniformly at random in $[.2,.7]$. The penalization parameter $\lambda$ is set to $.1$.}
  \label{fig:running_time_n_cascades}
\end{figure}

We include here a running time analysis of the different algorithms, shown in Figure~\ref{fig:running_time_n_nodes} and Figure~\ref{fig:running_time_n_cascades}; a minimal sketch of the experimental setup appears at the end of this section.
As expected, the simple greedy algorithm is the fastest.
Interestingly, the non-penalized convex optimization program also converges faster than the penalized maximum-likelihood program we propose.

%\subsection{Other continuous time processes binned to ours: proportional
%hazards model}

%\subsection{Irrepresentability vs. Restricted Eigenvalue Condition}
%In the words and notation of Theorem 9.1 in \cite{vandegeer:2009}:
%\begin{lemma}
%\label{lemm:irrepresentability_proof}
%Let $\phi^2_{\text{compatible}}(L,S) \defeq \min \{ \frac{s
%\|f_\beta\|^2_2}{\|\beta_S\|^2_1} \ : \ \beta \in {\cal R}(L, S) \}$, where
%$\|f_\beta\|^2_2 \defeq \{ \beta^T \Sigma \beta \}$ and ${\cal R}(L,S) \defeq
%\{\beta : \|\beta_{S^c}\|_1 \leq L \|\beta_S\|_1 \neq 0\}$. If
%$\nu_{\text{irrepresentable}(S,s)} < 1/L$, then $\phi^2_{\text{compatible}}(L,S)
%\geq (1 - L \nu_{\text{irrepresentable}(S,s)})^2 \lambda_{\min}^2$.
%\end{lemma}
%Since ${\cal R}(3, S) = {\cal C}$, $\|\beta_S\|_1 \geq \|\beta_S\|_2$, and
%$\|\beta_S\|_1 \geq \frac{1}{3} \|\beta_{S^c}\|_1$ it is easy to see that
%$\|\beta_S\|_1 \geq \frac{1}{4} \|\beta\|_2$ and therefore that: $\gamma_n \geq
%\frac{n}{4s}\phi^2_{\text{compatible}}(3,S)$
%Consequently, if $\epsilon > \frac{2}{3}$, then
%$\nu_{\text{irrepresentable}(S,s)} < 1/3$ and the conditions of
%Lemma~\ref{lemm:irrepresentability_proof} hold.

%\subsection{Lower bound for restricted eigenvalues (expected hessian) for
%different graphs}
%\subsection{Better asymptotic w.r.t expected hessian}
%\subsection{Confidence intervals?}
%\subsection{Active learning}
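For completeness, the following is a minimal sketch of the experimental setup described above: a Barabasi-Albert graph with attachment parameter $k=30$, edge weights drawn uniformly at random in $[.2,.7]$, and cascades seeded by infecting each node independently with probability $p_{\text{init}}=.15$. It is written in Python with \texttt{networkx} and \texttt{numpy}; it is not the code used to produce the figures, the function names are illustrative, and it assumes standard independent-cascade dynamics.
\begin{verbatim}
# Illustrative sketch only (not the code behind the figures); assumes
# independent-cascade dynamics with per-edge infection probabilities.
import numpy as np
import networkx as nx

rng = np.random.default_rng(0)

def make_graph(n_nodes=1000, k=30):
    # Barabasi-Albert graph; each new node attaches to k existing nodes.
    g = nx.barabasi_albert_graph(n_nodes, k, seed=0)
    for u, v in g.edges():
        g[u][v]["weight"] = rng.uniform(0.2, 0.7)
    return g

def simulate_cascade(g, p_init=0.15):
    # One discrete-time cascade: every node infected at step t tries to
    # infect each susceptible neighbour with probability = edge weight.
    n = g.number_of_nodes()
    infected = rng.random(n) < p_init      # initial infections x^0
    newly = infected.copy()
    states = [infected.copy()]
    while newly.any():
        prob_not = np.ones(n)              # P[node stays uninfected]
        for u in np.flatnonzero(newly):
            for v in g.neighbors(u):
                if not infected[v]:
                    prob_not[v] *= 1.0 - g[u][v]["weight"]
        newly = (~infected) & (rng.random(n) < 1.0 - prob_not)
        infected |= newly
        states.append(infected.copy())
    return np.array(states, dtype=int)     # shape: (n_steps, n_nodes)

# Example: 100 cascades on a 1000-node graph.
graph = make_graph()
cascades = [simulate_cascade(graph) for _ in range(100)]
\end{verbatim}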