\subsection{Proof for different lemmas}
\subsubsection{Bounded gradient}
\begin{proof}
The gradient of $\mathcal{L}$ is given by:
\begin{multline*}
\nabla \mathcal{L}(\theta^*) =
\frac{1}{|\mathcal{T}|}\sum_{t\in \mathcal{T}}x^t\bigg[
x_i^{t+1}\frac{f'}{f}(\inprod{\theta^*}{x^t})\\
- (1-x_i^{t+1})\frac{f'}{1-f}(\inprod{\theta^*}{x^t})\bigg]
\end{multline*}
Let $\partial_j \mathcal{L}(\theta^*)$ denote the $j$-th coordinate of
$\nabla\mathcal{L}(\theta^*)$. Writing
$\partial_j\mathcal{L}(\theta^*)
= \frac{1}{|\mathcal{T}|}\sum_{t\in\mathcal{T}} Y_t$ and since
$\E[x_i^{t+1}|x^t]= f(\inprod{\theta^*}{x^t})$, we have that $\E[Y_{t+1}|Y_1,\dots,Y_t]
= 0$. Hence $Z_t = \sum_{k=1}^t Y_k$ is a martingale.
Using assumption (LF), we have almost surely $|Z_{t+1}-Z_t|\leq
\frac{1}{\alpha}$, so we can apply Azuma's inequality to $Z_t$:
\begin{displaymath}
\P\big[|Z_{|\mathcal{T}|}|\geq \lambda\big]\leq
2\exp\left(\frac{-\lambda^2\alpha}{2n}\right)
\end{displaymath}
Since $\partial_j\mathcal{L}(\theta^*)=Z_{|\mathcal{T}|}/|\mathcal{T}|$, rescaling
$\lambda$ accordingly and taking a union bound over the $m$ coordinates of
$\nabla\mathcal{L}(\theta^*)$ implies:
\begin{align*}
\P\big[\|\nabla\mathcal{L}(\theta^*)\|_{\infty}\geq \lambda \big]
&\leq 2m\exp\left(\frac{-\lambda^2n\alpha}{2}\right)
\end{align*}
Choosing $\lambda\defeq 2\sqrt{\frac{\log m}{\alpha n^{1-\delta}}}$ concludes
the proof.
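For concreteness, substituting this choice of $\lambda$ into the bound above
gives $\frac{\lambda^2 n\alpha}{2} = 2n^{\delta}\log m$, so that
\begin{displaymath}
\P\big[\|\nabla\mathcal{L}(\theta^*)\|_{\infty}\geq \lambda\big]
\leq 2m\exp\left(-2n^{\delta}\log m\right) = 2m^{1-2n^{\delta}},
\end{displaymath}
which is at most $2/m$ whenever $n^{\delta}\geq 1$.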
\end{proof}
\subsubsection{RE with high probability}
\begin{proof}Writing $H\defeq \nabla^2\mathcal{L}(\theta^*)$, if
$\|\E[H] - H\|_\infty\leq \lambda$
and $\E[H]$ verifies the $(S,\gamma)$-(RE)
condition, then:
\begin{equation}
\label{eq:foo}
\forall \Delta\in C(S),\;
\Delta^T H\Delta \geq
\Delta^T \E[H]\Delta\,(1-32s\lambda/\gamma)
\end{equation}
Indeed, $
|\Delta^T(H-\E[H])\Delta| \leq 2\lambda \|\Delta\|_1^2\leq
2\lambda(4\sqrt{s}\|\Delta_S\|_2)^2
$.
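Spelling out the last step: for $\Delta\in C(S)$ we have
$\|\Delta\|_1\leq 4\|\Delta_S\|_1\leq 4\sqrt{s}\|\Delta_S\|_2$, and the
$(S,\gamma)$-(RE) condition in particular gives
$\Delta^T\E[H]\Delta\geq\gamma\|\Delta_S\|_2^2$ on $C(S)$, so that
\begin{displaymath}
\big|\Delta^T(H-\E[H])\Delta\big| \leq 32s\lambda\|\Delta_S\|_2^2
\leq \frac{32s\lambda}{\gamma}\,\Delta^T \E[H]\Delta,
\end{displaymath}
which yields \eqref{eq:foo}.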
Writing
$\partial^2_{i,j}\mathcal{L}(\theta^*)=\frac{1}{|\mathcal{T}|}\sum_{t\in
\mathcal{T}}Y_t$ and using (LF) and (LF2), we have $\big|Y_t - \E[Y_t]\big|\leq
\frac{4(M+2)}{\alpha}$.
Applying Azuma's inequality as in the proof of Lemma~\ref{lem:ub}, this
implies:
\begin{displaymath}
\P\big[\|\E[H]-H\|_{\infty}\geq\lambda\big] \leq
2\exp\left(-\frac{n\alpha\lambda^2}{4(M+2)} + 2\log m\right)
\end{displaymath}
Thus, if we take $\lambda=\sqrt{\frac{12(M+2)\log m}{\alpha
n^{1-\delta}}}$, then $\|\E[H]-H\|_{\infty}\leq\lambda$ w.p.\ at least
$1-e^{-n^{\delta}\log m}$. When $n^{1-\delta}\geq
\frac{M+2}{21\gamma\alpha}s^2\log m$, \eqref{eq:foo} implies
$
\forall \Delta\in C(S),\;
\Delta^T H\Delta \geq \frac{1}{2} \Delta^T \E[H]\Delta,
$ w.p.\ at least $1-e^{-n^{\delta}\log m}$, and the conclusion of
Proposition~\ref{prop:fi} follows.
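For completeness, with this choice of $\lambda$ the exponent above evaluates to
$\frac{n\alpha\lambda^2}{4(M+2)} = 3n^{\delta}\log m$, so that
\begin{displaymath}
\P\big[\|\E[H]-H\|_{\infty}\geq\lambda\big] \leq
2\exp\left(-3n^{\delta}\log m + 2\log m\right)\leq e^{-n^{\delta}\log m},
\end{displaymath}
the last inequality holding as soon as, e.g., $n^{\delta}\geq 2$ and $m\geq 2$.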
\end{proof}
\subsection{Lower Bound}
Let us consider an algorithm $\mathcal{A}$ which verifies the recovery
guarantee of Theorem~\ref{thm:lb}: there exists a probability distribution over
measurements such that for all vectors $\theta^*$, \eqref{eq:lb} holds w.p.
$\delta$. By the probabilistic method, this implies that for every distribution
$D$ over vectors $\theta$, there exists an $n\times m$ measurement matrix $X_D$
such that \eqref{eq:lb} holds w.p. $\delta$ ($\theta$ is now the random
variable).
Consider the following distribution $D$: choose $S$ uniformly at random from a
``well-chosen'' set of $s$-sparse supports $\mathcal{F}$ and $t$ uniformly at
random from
$X \defeq\big\{t\in\{-1,0,1\}^m\,|\, \mathrm{supp}(t)\in\mathcal{F}\big\}$.
Define $\theta = t + w$ where
$w\sim\mathcal{N}(0, \alpha\frac{s}{m}I_m)$ and $\alpha = \Omega(\frac{1}{C})$.
Consider the following communication game between Alice and Bob: \emph{(1)}
Alice draws $y\in\{0,1\}^n$ from a Bernoulli distribution of parameter
$f(X_D\theta)$ and sends it to Bob. \emph{(2)} Bob uses $\mathcal{A}$ to recover
$\hat{\theta}$ from $y$. It can be shown that at the end of the game Bob
has a quantity of information $\Omega(s\log \frac{m}{s})$ about $S$. By the
Shannon-Hartley theorem, this information is also upper-bounded by $\O(n\log
C)$. These two bounds together imply the theorem.
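To make the final step explicit, the lower bound on the information Bob holds
about $S$ must be consistent with the Shannon-Hartley upper bound, so that (up
to constants)
\begin{displaymath}
s\log\frac{m}{s} = \O(n\log C),
\quad\text{i.e.,}\quad
n = \Omega\left(\frac{s\log(m/s)}{\log C}\right),
\end{displaymath}
which is how the two estimates combine to yield the theorem.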
\subsection{Running Time Analysis}
\begin{figure}
\centering
\includegraphics[scale=.4]{figures/n_nodes_running_time.pdf}
\caption{Running time for estimating the parents of a \emph{single
node} on a Barabasi-Albert graph, as a function of the number of nodes in
the graph. The parameter $k$ (number of nodes each new node is attached to)
is held constant at $30$. $p_{\text{init}}$ is set to $.15$, and
the edge weights are chosen uniformly at random in $[.2,.7]$. The
penalization parameter $\lambda$ is set to $.1$.}
\label{fig:running_time_n_nodes}
\end{figure}
\begin{figure}
\centering
\includegraphics[scale=.4]{figures/n_cascades_running_time.pdf}
\caption{Running time for estimating the parents of a \emph{single
node} on a Barabasi-Albert graph, as a function of the total number of
observed cascades. The parameter $k$ (number of nodes each new node is
attached to) is held constant at $30$. $p_{\text{init}}$ is set
to $.15$, and the edge weights are chosen uniformly at random in $[.2,.7]$.
The penalization parameter $\lambda$ is set to $.1$.}
\label{fig:running_time_n_cascades}
\end{figure}
We include here a running time analysis of the different algorithms, shown
in Figures~\ref{fig:running_time_n_nodes}
and~\ref{fig:running_time_n_cascades}. As expected, the simple greedy
algorithm is the fastest. Interestingly, the non-penalized convex
optimization program also converges faster than our suggested penalized
maximum-likelihood program.
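For reference, the following sketch illustrates the kind of $\ell_1$-penalized
maximum-likelihood program being timed for a single node, assuming a logistic
activation function $f$. It is an illustrative reimplementation using a generic
quasi-Newton solver, not the code used to produce the figures.
\begin{verbatim}
# Minimal sketch (illustrative, not the paper's implementation) of the
# l1-penalized maximum-likelihood estimate of one node's incoming weights.
# X: rows are observed network states x^t; y: next-step activations x_i^{t+1}.
import numpy as np
from scipy.optimize import minimize

def penalized_mle(X, y, lam=0.1):
    n_obs, m = X.shape

    def objective(theta):
        z = X @ theta
        # Negative log-likelihood for P[x_i^{t+1}=1 | x^t] = sigmoid(<theta, x^t>)
        nll = np.mean(np.logaddexp(0.0, z) - y * z)
        return nll + lam * np.sum(np.abs(theta))

    # L-BFGS-B ignores the non-smoothness of the l1 term; a proximal solver
    # would be the more careful choice, but this keeps the sketch short.
    return minimize(objective, np.zeros(m), method="L-BFGS-B").x

# Toy usage on synthetic data (dimensions chosen arbitrarily).
rng = np.random.default_rng(0)
X = rng.integers(0, 2, size=(500, 20)).astype(float)
theta_true = np.zeros(20)
theta_true[:3] = 1.0
y = (rng.random(500) < 1.0 / (1.0 + np.exp(-(X @ theta_true)))).astype(float)
print(np.round(penalized_mle(X, y)[:5], 2))
\end{verbatim}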