\subsection{Proofs of the different lemmas}

\subsubsection{Bounded gradient}

\begin{proof}
The gradient of $\mathcal{L}$ is given by:
\begin{multline*}
    \nabla \mathcal{L}(\theta^*) =
    \frac{1}{|\mathcal{T}|}\sum_{t\in \mathcal{T}}x^t\bigg[
        x_i^{t+1}\frac{f'}{f}(\inprod{\theta^*}{x^t})\\
    - (1-x_i^{t+1})\frac{f'}{1-f}(\inprod{\theta^*}{x^t})\bigg]
\end{multline*}

Let $\partial_j \mathcal{L}(\theta^*)$ be the $j$-th coordinate of
$\nabla\mathcal{L}(\theta^*)$ and write
$\partial_j\mathcal{L}(\theta^*)
= \frac{1}{|\mathcal{T}|}\sum_{t\in\mathcal{T}} Y_t$. Since
$\E[x_i^{t+1}|x^t]= f(\inprod{\theta^*}{x^t})$, we have $\E[Y_t|x^1,\dots,x^t]
= 0$. Hence $Z_t \defeq \sum_{k=1}^t Y_k$ is a martingale.
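Explicitly, writing $u_t\defeq\inprod{\theta^*}{x^t}$, each term here is
$Y_t = x_j^t\big[x_i^{t+1}\frac{f'}{f}(u_t)
- (1-x_i^{t+1})\frac{f'}{1-f}(u_t)\big]$, so that
\begin{displaymath}
    \E[Y_t\,|\,x^t]
    = x_j^t\left[f(u_t)\frac{f'(u_t)}{f(u_t)}
    - \big(1-f(u_t)\big)\frac{f'(u_t)}{1-f(u_t)}\right] = 0.
\end{displaymath}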

Using assumption (LF), we have almost surely $|Z_{t+1}-Z_t|\leq
\frac{1}{\alpha}$, so we can apply Azuma's inequality to $Z_t$:
\begin{displaymath}
    \P\big[|Z_{|\mathcal{T}|}|\geq \lambda\big]\leq
    2\exp\left(\frac{-\lambda^2\alpha}{2n}\right)
\end{displaymath}

Since $\partial_j\mathcal{L}(\theta^*) = Z_{|\mathcal{T}|}/|\mathcal{T}|$ and
$|\mathcal{T}| = n$, applying the previous inequality with $n\lambda$ in place
of $\lambda$ and taking a union bound over the $m$ coordinates of
$\nabla\mathcal{L}(\theta^*)$ implies:
\begin{align*}
    \P\big[\|\nabla\mathcal{L}(\theta^*)\|_{\infty}\geq \lambda \big]
    &\leq 2m\exp\left(\frac{-\lambda^2n\alpha}{2}\right)
\end{align*}
Choosing $\lambda\defeq 2\sqrt{\frac{\log m}{\alpha n^{1-\delta}}}$ concludes
the proof.
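Indeed, with this choice of $\lambda$,
$\frac{\lambda^2 n\alpha}{2} = 2n^{\delta}\log m$, so that
\begin{displaymath}
    \P\big[\|\nabla\mathcal{L}(\theta^*)\|_{\infty}\geq \lambda\big]
    \leq 2m\exp\left(-2n^{\delta}\log m\right) = 2m^{1-2n^{\delta}}.
\end{displaymath}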
\end{proof}

\subsubsection{Approximate sparsity proof}
\subsubsection{RE with high probability}

\begin{proof}Writing $H\defeq \nabla^2\mathcal{L}(\theta^*)$, if
    $\|\E[H] - H\|_\infty\leq \lambda$
    and $\E[H]$ verifies the $(S,\gamma)$-(RE)
    condition then:
    \begin{equation}
        \label{eq:foo}
    \forall \Delta\in C(S),\;
    \Delta H\Delta \geq
    \Delta \E[H]\Delta(1-32s\lambda/\gamma)
    \end{equation}
    Indeed, for any $\Delta\in C(S)$,
    $|\Delta(H-\E[H])\Delta| \leq 2\lambda \|\Delta\|_1^2\leq
    2\lambda(4\sqrt{s}\|\Delta_S\|_2)^2 = 32s\lambda\|\Delta_S\|_2^2
    $.
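    Assuming the $(S,\gamma)$-(RE) condition in the form
    $\Delta\,\E[H]\,\Delta\geq\gamma\|\Delta_S\|_2^2$ on $C(S)$, this yields
    \begin{displaymath}
        |\Delta(H-\E[H])\Delta| \leq 32s\lambda\|\Delta_S\|_2^2
        \leq \frac{32s\lambda}{\gamma}\,\Delta\,\E[H]\,\Delta,
    \end{displaymath}
    and \eqref{eq:foo} follows since $\Delta H\Delta \geq \Delta\E[H]\Delta -
    |\Delta(H-\E[H])\Delta|$.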
    Writing
    $\partial^2_{i,j}\mathcal{L}(\theta^*)=\frac{1}{|\mathcal{T}|}\sum_{t\in
    \mathcal{T}}Y_t$ and using $(LF)$ and $(LF2)$, we have $\big|Y_t -
    \E[Y_t]\big|\leq \frac{4(M+2)}{\alpha}$.
    Applying Azuma's inequality as in the proof of Lemma~\ref{lem:ub}, this
    implies:
    \begin{displaymath}
        \P\big[\|\E[H]-H\|_{\infty}\geq\lambda\big] \leq
        2\exp\left(-\frac{n\alpha\lambda^2}{4(M+2)} + 2\log m\right)
    \end{displaymath}
    Thus, if we take $\lambda=\sqrt{\frac{12(M+2)\log m}{\alpha
    n^{1-\delta}}}$, then $\|\E[H]-H\|_{\infty}\leq\lambda$ w.p.\ at least
    $1-e^{-n^{\delta}\log m}$. When $n^{1-\delta}\geq
    \frac{M+2}{21\gamma\alpha}s^2\log m$, \eqref{eq:foo} implies
    $
    \forall \Delta\in C(S),\;
    \Delta H\Delta \geq \frac{1}{2} \Delta \E[H]\Delta,
    $ w.p.\ at least $1-e^{-n^{\delta}\log m}$, and the conclusion of
    Proposition~\ref{prop:fi} follows.
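    For completeness, the choice of $\lambda$ above gives
    $\frac{n\alpha\lambda^2}{4(M+2)} = 3n^{\delta}\log m$, so the failure
    probability is at most
    \begin{displaymath}
        2\exp\left(-3n^{\delta}\log m + 2\log m\right)
        \leq e^{-n^{\delta}\log m}
    \end{displaymath}
    as soon as $n^{\delta}\geq 2$ and $m\geq 2$.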
\end{proof}

\subsection{Lower Bound}

Let us consider an algorithm $\mathcal{A}$ which verifies the recovery
guarantee of Theorem~\ref{thm:lb}: there exists a probability distribution over
measurements such that for all vectors $\theta^*$, \eqref{eq:lb} holds w.p.\
$\delta$. By the probabilistic method, this implies that for every distribution
$D$ over vectors $\theta$, there exists an $n\times m$ measurement matrix $X_D$
such that \eqref{eq:lb} holds w.p.\ $\delta$ ($\theta$ is now the random
variable).

Consider the following distribution $D$: choose $S$ uniformly at random from a
``well-chosen'' set of $s$-sparse supports $\mathcal{F}$ and $t$ uniformly at
random from
$X \defeq\big\{t\in\{-1,0,1\}^m\,|\, \mathrm{supp}(t)\in\mathcal{F}\big\}$.
Define $\theta = t + w$ where
$w\sim\mathcal{N}(0, \alpha\frac{s}{m}I_m)$ and $\alpha = \Omega(\frac{1}{C})$.

Consider the following communication game between Alice and Bob: \emph{(1)}
Alice draws $y\in\{0,1\}^n$ from a Bernoulli distribution of parameter
$f(X_D\theta)$ and sends it to Bob. \emph{(2)} Bob uses $\mathcal{A}$ to
recover $\hat{\theta}$ from $y$. It can be shown that at the end of the game
Bob has acquired $\Omega(s\log \frac{m}{s})$ bits of information about $S$. By
the Shannon-Hartley theorem, this information is also upper bounded by
$\O(n\log C)$. These two bounds together imply the theorem.
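Concretely, the two bounds together force
\begin{displaymath}
    s\log\frac{m}{s} = \O(n\log C),
    \qquad\text{i.e.,}\qquad
    n = \Omega\left(\frac{s\log(m/s)}{\log C}\right).
\end{displaymath}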

\subsection{Other continuous-time processes reduced to ours by binning: the
proportional hazards model}

\subsection{Irrepresentability vs. Restricted Eigenvalue Condition}
In the words and notation of Theorem 9.1 in \cite{vandegeer:2009}:
\begin{lemma}
\label{lemm:irrepresentability_proof}
Let $\phi^2_{\text{compatible}}(L,S) \defeq \min \{ \frac{s
\|f_\beta\|^2_2}{\|\beta_S\|^2_1} \ : \ \beta \in {\cal R}(L, S) \}$, where
$\|f_\beta\|^2_2 \defeq \beta^T \Sigma \beta$ and ${\cal R}(L,S) \defeq
\{\beta : \|\beta_{S^c}\|_1 \leq L \|\beta_S\|_1 \neq 0\}$. If
$\nu_{\text{irrepresentable}}(S,s) < 1/L$, then
$\phi^2_{\text{compatible}}(L,S)
\geq (1 - L \nu_{\text{irrepresentable}}(S,s))^2 \lambda_{\min}^2$.
\end{lemma}

Since ${\cal R}(3, S) = {\cal C}$, $\|\beta_S\|_1 \geq \|\beta_S\|_2$, and
$\|\beta_S\|_1 \geq \frac{1}{3} \|\beta_{S^c}\|_1$, it follows that
$\|\beta_S\|_1 \geq \frac{1}{4} \|\beta\|_2$, and therefore that $\gamma_n \geq
\frac{n}{4s}\phi^2_{\text{compatible}}(3,S)$.
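Indeed, for any $\beta\in{\cal R}(3,S)$,
\begin{displaymath}
    \|\beta\|_2 \leq \|\beta\|_1 = \|\beta_S\|_1 + \|\beta_{S^c}\|_1
    \leq 4\|\beta_S\|_1.
\end{displaymath}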

Consequently, if $\epsilon > \frac{2}{3}$, then
$\nu_{\text{irrepresentable}}(S,s) < 1/3$ and the conditions of
Lemma~\ref{lemm:irrepresentability_proof} hold.
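Combining Lemma~\ref{lemm:irrepresentability_proof} (with $L=3$) and the bound
above then gives the explicit estimate
\begin{displaymath}
    \gamma_n \geq \frac{n}{4s}
    \big(1 - 3\,\nu_{\text{irrepresentable}}(S,s)\big)^2 \lambda_{\min}^2 > 0.
\end{displaymath}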



\subsection{Lower bound for restricted eigenvalues (expected Hessian) for
different graphs}

\subsection{Better asymptotics w.r.t.\ the expected Hessian}

\subsection{Confidence intervals?}

\subsection{Active learning}