author    Thibaut Horel <thibaut.horel@gmail.com>  2015-02-06 15:20:21 -0500
committer Thibaut Horel <thibaut.horel@gmail.com>  2015-02-06 15:20:21 -0500
commit    748e4be47ca0e6ee8dccdeb905834dc0c7ddd6b6 (patch)
tree      baed53cddedac59e91cb3db77d945451dfc1450c /paper
parent    b47fc034ec5dbe65fe8ceb296254c0963f96128f (diff)
download  cascades-748e4be47ca0e6ee8dccdeb905834dc0c7ddd6b6.tar.gz

Section 3

Diffstat (limited to 'paper')
 paper/sections/results.tex | 90 ++++++++++++++++++++++++-----------
 1 file changed, 61 insertions(+), 29 deletions(-)
diff --git a/paper/sections/results.tex b/paper/sections/results.tex
index 01a33e9..f32b037 100644
--- a/paper/sections/results.tex
+++ b/paper/sections/results.tex
@@ -224,11 +224,11 @@ exactly $s$-sparse, it can be well approximated by $s$-sparse vectors.
Rather than obtaining an impossibility result, we show that the bounds obtained
in Section~\ref{sec:main_theorem} degrade gracefully in this setting. Formally,
let
-\begin{displaymath}
+$
\theta^*_{\lfloor s \rfloor}
\in \argmin_{\|\theta\|_0 \leq s} \|\theta - \theta^*\|_1
-\end{displaymath}
-be the best $s$-approximation to $\theta^*$, then we pay a cost proportional
+ $
+be the best $s$-sparse approximation to $\theta^*$. Then we pay a cost proportional
to $\|\theta^* - \theta^*_{\lfloor s\rfloor}\|_1$ for recovering the weights of
non-exactly sparse vectors. This cost is simply the ``tail'' of
$\theta^*$: the sum of the $m-s$ smallest coordinates of $\theta^*$.
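+Explicitly, if $|\theta^*|_{(1)}\geq\dots\geq|\theta^*|_{(m)}$ denote the
+coordinates of $\theta^*$ sorted by magnitude, a best $s$-sparse approximation
+keeps the $s$ largest coordinates and zeroes out the rest, so that
+\begin{displaymath}
+\|\theta^* - \theta^*_{\lfloor s\rfloor}\|_1 = \sum_{j=s+1}^{m} |\theta^*|_{(j)}.
+\end{displaymath}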
@@ -280,7 +280,7 @@ irrepresentability condition considered in \cite{Daneshmand:2014}.
\paragraph{Interpretation}
There exists a large class of sufficient conditions under which sparse recovery
-is achievable in the context of regularized estimation. A good survey on these
+is achievable in the context of regularized estimation. A good survey of these
different assumptions can be found in \cite{vandegeer:2009}.
The restricted eigenvalue condition introduced in \cite{bickel:2009} is one of
@@ -318,12 +318,13 @@ probability \cite{raskutti:10, rudelson:13}.
In our case, we can show that if the (RE)-condition holds for the expected Hessian
matrix $\E[\nabla^2\mathcal{L}(\theta^*)]$, then it holds for the finite sample
-Hessian matrix $\nabla^2\mathcal{L}(\theta)$ with high probability. Note that
+Hessian matrix $\nabla^2\mathcal{L}(\theta^*)$ with high probability. Note that
the expected Hessian matrix is exactly the Fisher Information matrix of the
generalized linear cascade model, which captures the amount of information about
$\theta$ conveyed by the random observations. Therefore, under an assumption
-which only involves the probabilistic model and not the actual observations, we
-can reformulate Theorem~\ref{thm:main}.
+which only involves the probabilistic model and not the actual observations,
+Proposition~\ref{prop:fi} allows us to draw the same conclusion as in
+Theorem~\ref{thm:main}.
We will need the following additional assumptions on the inverse link function $f$:
\begin{equation}
@@ -334,30 +335,58 @@ We will need the following additional assumptions on the inverse link function $
\left|\frac{f''}{f}\right|\right)
\leq\frac{1}{\alpha}
\end{equation}
-whenever $f(\inprod{theta^*}{x})\notin\{0,1\}$.
+whenever $f(\inprod{\theta^*}{x})\notin\{0,1\}$.
-\begin{theorem}
+\begin{proposition}
+ \label{prop:fi}
If $\E[\nabla^2\mathcal{L}(\theta^*)]$ verifies the $(S,\gamma)$-{\bf (RE)}
- condition. Assuming {\bf (LF)} and {\bf (LF2)}, let $\hat\theta$ be
- a solution to \eqref{eq:pre-mle}, then if $n\geq $ we have:
+ condition and {\bf (LF)} and {\bf (LF2)} hold, then for any $\delta> 0$ such
+ that $n^{1-\delta}\geq \frac{M+2}{21\gamma\alpha}s^2\log m$,
+ $\nabla^2\mathcal{L}(\theta^*)$ verifies the $(S,\frac{\gamma}{2})$-(RE)
+ condition with probability at least $1-e^{-n^\delta\log m}$.
+\end{proposition}
+
+\begin{proof}Writing $H\defeq \nabla^2\mathcal{L}(\theta^*)$, if
+ $\|\E[H] - H\|_\infty\leq \lambda$
+ and $\E[H]$ verifies the $(S,\gamma)$-(RE)
+ condition, then:
+ \begin{equation}
+ \label{eq:foo}
+ \forall \Delta\in C(S),\;
+ \Delta H\Delta \geq
+ \Delta \E[H]\Delta(1-32s\lambda/\gamma)
+ \end{equation}
+ Indeed, $
+ |\Delta(H-\E[H])\Delta| \leq 2\lambda \|\Delta\|_1^2\leq
+ 2\lambda(4\sqrt{s}\|\Delta_S\|_2)^2
+ $.
+ Writing
+ $\partial^2_{i,j}\mathcal{L}(\theta^*)=\frac{1}{|\mathcal{T}|}\sum_{t\in
+ \mathcal{T}}Y_t$ and using {\bf (LF)} and {\bf (LF2)}, we have $\big|Y_t - \E[Y_t]\big|\leq
+ \frac{4(M+2)}{\alpha}$.
+ Applying Azuma's inequality as in the proof of Lemma~\ref{lem:ub} then
+ yields:
\begin{displaymath}
- \|\hat \theta - \theta^* \|_2 \leq \frac{3}{\gamma} \sqrt{\frac{s \log
- m}{\alpha n^{1-\delta}}} \quad \text{w.p.}\;1-\frac{1}{e^{n^\delta \log m}}
+ \P\big[\|\E[H]-H\|_{\infty}\geq\lambda\big] \leq
+ 2\exp\left(-\frac{n\alpha\lambda^2}{4(M+2)} + 2\log m\right)
\end{displaymath}
-\end{theorem}
-
-\begin{proof}
+ Thus, if we take $\lambda=\sqrt{\frac{12(M+2)\log m}{\alpha
+ n^{1-\delta}}}$, then $\|\E[H]-H\|_{\infty}\leq\lambda$ with probability at
+ least $1-e^{-n^{\delta}\log m}$. When $n^{1-\delta}\geq
+ \frac{M+2}{21\gamma\alpha}s^2\log m$, \eqref{eq:foo} implies
+ $
+ \forall \Delta\in C(S),\;
+ \Delta H\Delta \geq \frac{1}{2} \Delta \E[H]\Delta
+ $ with probability at least $1-e^{-n^{\delta}\log m}$, and the conclusion of
+ Proposition~\ref{prop:fi} follows.
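+ To verify the stated probability: with this choice of $\lambda$,
+ $\frac{n\alpha\lambda^2}{4(M+2)} = 3n^{\delta}\log m$, so the deviation
+ probability is at most
+ \begin{displaymath}
+ 2\exp\left(-3n^{\delta}\log m + 2\log m\right) \leq e^{-n^{\delta}\log m},
+ \end{displaymath}
+ where the last inequality holds as soon as, say, $m\geq 2$ and
+ $n^{\delta}\geq 2$.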
\end{proof}
-
-
-By adapting Theorem
-8 \cite{rudelson:13}, this
- can be reduced to:
-\begin{displaymath}
-n \geq C s \log m \log^3 \left( \frac{s \log m}{C'} \right)
-\end{displaymath}
-where $C, C'$ are constants not depending on $(s, m, n)$.
+Observe that the number of measurements required in Proposition~\ref{prop:fi}
+is now quadratic in $s$. If we only keep the first measurement from each
+cascade, which are independent across cascades, we can apply Theorem 1.8 from
+\cite{rudelson:13}, lowering the number of required cascades to
+$s\log(s\log^3 m)$.
\paragraph{(RE) vs Irrepresentability Condition}
@@ -365,6 +394,7 @@ where $C, C'$ are constants not depending on $(s, m, n)$.
likelihood function. Their condition is equivalent to what is more commonly
called the {\it (S,s)-irrepresentability} condition.
+\begin{comment}
\begin{definition}
Following similar notation, let $Q^* \defeq \nabla^2 f(\theta^*)$. Let $S$ and $S^c$ be the set of indices of all the parents and non-parents respectively and $Q_{S,S}$, $Q_{S^c,S}$, $Q_{S, S^c}$, and $Q_{S^c, S^c}$ the induced sub-matrices. Consider the following constant:
\begin{equation}
@@ -372,17 +402,19 @@ Following similar notation, let $Q^* \defeq \nabla^2 f(\theta^*)$. Let $S$ and $
\end{equation}
The (S,s)-irrepresentability condition holds if $\nu_{\text{irrepresentable}}(S) < 1 - \epsilon$ for some $\epsilon > 0$.
\end{definition}
+\end{comment}
It is possible to construct examples where the (RE) condition holds but not the
-irrepresentability condition \cite{vandegeer:2009}. In fact, a slightly
-modified result from \cite{vandegeer:2009} shows that a `strong'
-irrepresentability condition directly {\it implies} the {\bf(RE)} condition for
-$\ell_2$-recovery:
+irrepresentability condition \cite{vandegeer:2009}. Theorem 9.1 from the same
+paper shows that a `strong' irrepresentability condition directly {\it implies}
+the {\bf(RE)} condition for $\ell_2$-recovery.
+\begin{comment}
\begin{proposition}
\label{prop:irrepresentability}
If the irrepresentability condition holds with $\epsilon > \frac{2}{3}$, then the restricted eigenvalue condition holds with constant $\gamma_n \geq \frac{ (1 - 3(1 -\epsilon))^2 \lambda_{\min}^2}{4s}n$, where $\lambda_{\min} > 0$ is the smallest eigenvalue of $Q^*_{S,S}$, on which the results of \cite{Daneshmand:2014} also depend.
\end{proposition}
+\end{comment}
\begin{comment}
Furthermore, recent papers \cite{vandegeer:2011}, \cite{Zou:2006}, argue that