Diffstat (limited to 'paper')
-rw-r--r--  paper/paper.tex              19
-rw-r--r--  paper/sections/appendix.tex  36
-rw-r--r--  paper/sections/model.tex     15
-rw-r--r--  paper/sections/results.tex  265
-rw-r--r--  paper/sparse.bib             53
5 files changed, 206 insertions, 182 deletions
diff --git a/paper/paper.tex b/paper/paper.tex
index 5311bec..e7a40a6 100644
--- a/paper/paper.tex
+++ b/paper/paper.tex
@@ -71,6 +71,7 @@
\newtheorem{remark}{Remark}
\newtheorem{proposition}{Proposition}
\newtheorem{definition}{Definition}
+\newtheorem{fact}{Fact}
\begin{document}
@@ -81,12 +82,10 @@
% submissions: the style file will automatically remove it for you
% unless you've provided the [accepted] option to the icml2015
% package.
-\icmlauthor{Your Name}{email@yourdomain.edu}
-\icmladdress{Your Fantastic Institute,
- 314159 Pi St., Palo Alto, CA 94306 USA}
-\icmlauthor{Your CoAuthor's Name}{email@coauthordomain.edu}
-\icmladdress{Their Fantastic Institute,
- 27182 Exp St., Toronto, ON M6H 2T1 CANADA}
+\icmlauthor{Jean Pouget-Abadie}{jeanpougetabadie@g.harvard.edu}
+\icmladdress{Harvard University}
+\icmlauthor{Thibaut Horel}{thorel@seas.harvard.edu}
+\icmladdress{Harvard University}
% You may provide any keywords that you
% find helpful for describing your paper; these are used to populate
@@ -115,14 +114,6 @@
\label{sec:lowerbound}
\input{sections/lowerbound.tex}
-%\section{Assumptions}
-%\label{sec:assumptions}
-%\input{sections/assumptions.tex}
-
-% \section{Linear Threshold Model}
-% \label{sec:linear_threshold}
-% \input{sections/linear_threshold.tex}
-
\section{Experiments}
\label{sec:experiments}
\input{sections/experiments.tex}
diff --git a/paper/sections/appendix.tex b/paper/sections/appendix.tex
index afcf186..79baf2f 100644
--- a/paper/sections/appendix.tex
+++ b/paper/sections/appendix.tex
@@ -38,6 +38,42 @@
the proof.
\subsubsection{Approximate sparsity proof}
\subsubsection{RE with high probability}
+\begin{proof}Writing $H\defeq \nabla^2\mathcal{L}(\theta^*)$, if
+  $ \forall\Delta\in C(S),\;
+  \|\E[H] - H\|_\infty\leq \lambda $
+  and $\E[H]$ verifies the $(S,\gamma)$-(RE)
+  condition then:
+  \begin{equation}
+    \label{eq:foo}
+    \forall \Delta\in C(S),\;
+    \Delta H\Delta \geq
+    \Delta \E[H]\Delta(1-32s\lambda/\gamma)
+  \end{equation}
+  Indeed, $
+  |\Delta(H-\E[H])\Delta| \leq 2\lambda \|\Delta\|_1^2\leq
+  2\lambda(4\sqrt{s}\|\Delta_s\|_2)^2
+  $.
+  Writing
+  $\partial^2_{i,j}\mathcal{L}(\theta^*)=\frac{1}{|\mathcal{T}|}\sum_{t\in
+  T}Y_t$ and using $(LF)$ and $(LF2)$ we have $\big|Y_t - \E[Y_t]\big|\leq
+  \frac{4(M+2)}{\alpha}$.
+  Applying Azuma's inequality as in the proof of Lemma~\ref{lem:ub}, this
+  implies:
+  \begin{displaymath}
+    \P\big[\|\E[H]-H\|_{\infty}\geq\lambda\big] \leq
+    2\exp\left(-\frac{n\alpha\lambda^2}{4(M+2)} + 2\log m\right)
+  \end{displaymath}
+  Thus, if we take $\lambda=\sqrt{\frac{12(M+2)\log m}{\alpha
+  n^{1-\delta}}}$, $\|\E[H]-H\|_{\infty}\leq\lambda$ w.p. at least
+  $1-e^{-n^{\delta}\log m}$. When $n^{1-\delta}\geq
+  \frac{M+2}{21\gamma\alpha}s^2\log m$, \eqref{eq:foo} implies
+  $
+  \forall \Delta\in C(S),\;
+  \Delta H\Delta \geq \frac{1}{2} \Delta \E[H]\Delta,
+  $ w.p. at least $1-e^{-n^{\delta}\log m}$ and the conclusion of
+  Proposition~\ref{prop:fi} follows.
+\end{proof}
+
\subsection{Other continuous time processes binned to ours: proportional
hazards model}
diff --git a/paper/sections/model.tex b/paper/sections/model.tex
index b478e94..acec974 100644
--- a/paper/sections/model.tex
+++ b/paper/sections/model.tex
@@ -192,7 +192,7 @@
Generalized Linear Cascade model with inverse link function
$f:z\mapsto 1-e^{-\epsilon\cdot z}$.
\subsubsection{Logistic Cascades}
-
+\label{sec:logistic_cascades}
Consider the specific case of ``logistic cascades'' (where $f$ is the logistic function).
This model captures the idea that there is a threshold around which each
additional neighbor's contribution becomes significant: logistic
@@ -201,10 +201,7 @@
Threshold model~\cite{Kempe:03}. As we will see later in the analysis, the
Hessian of our optimization program simplifies in the case of a logistic
inverse link function to $\nabla^2\mathcal{L}(\theta^*) =
\frac{1}{|\mathcal{T}|}XX^T$ where $X$ is the observation matrix $[x^1 \ldots
-x^\mathcal{|T|}]$. The (RE)-condition we introduce subsequently is then
-equivalent to the assumption made in the Lasso analysis of~\cite{bickel:2009}.
-
-
+x^\mathcal{|T|}]$.
% \subsection{The Linear Threshold Model}
@@ -234,10 +231,10 @@
equivalent to the assumption made in the Lasso analysis of~\cite{bickel:2009}.
\begin{figure}
  \includegraphics[scale=.4]{figures/drawing.pdf}
-  \caption{Illustration of the sparse-recovery approach: the measurement matrix
-  is known, as is a Bernoulli realization of the matrix-vector product via the
-non-linear transformation induced by $f$. Our objective is to recover the
-unknown weight vector $\theta_i$ for each node $i$.}
+  \caption{Illustration of the sparse-recovery approach. Our objective is to
+  recover the unknown weight vector $\theta_j$ for each node $j$. We observe a
+Bernoulli realization of the $f$ transform of the matrix-vector product, where
+the measurement matrix encodes which nodes are ``contagious''.}
\end{figure}
diff --git a/paper/sections/results.tex b/paper/sections/results.tex
index 33bff55..a0d3159 100644
--- a/paper/sections/results.tex
+++ b/paper/sections/results.tex
@@ -3,7 +3,7 @@
assumptions our program~\eqref{eq:pre-mle} recovers the true parameter
$\theta_i$ of the cascade model. Furthermore, if we can estimate $\theta_i$ to
a sufficiently good accuracy, it is then possible to recover the support of
$\theta_i$ by simple thresholding, which provides a solution to the standard
-Graph Inference problem.
+Network Inference problem.
We will first give results in the exactly sparse setting in which $\theta_i$
has a support of size exactly $s$. We will then relax this sparsity constraint
@@ -24,8 +24,8 @@
write $S\defeq\text{supp}(\theta^*)$ and $s=|S|$. Recall, that $\theta_i$ is the
vector of weights for all edges \emph{directed at} the node we are solving for.
In other words, $S$ is the set of all nodes susceptible to influence node $i$,
also referred to as its parents. Our main theorem will rely on the now standard
-\emph{restricted eigenvalue condition}.
-{\color{red} Add reference}
+\emph{restricted eigenvalue condition} introduced
+by~\cite{bickel2009simultaneous}.
\begin{definition}
Let $\Sigma\in\mathcal{S}_m(\reals)$ be a real symmetric matrix and $S$ be
@@ -119,13 +119,12 @@
$\hat{\theta}_\lambda$ is the solution of \eqref{eq:pre-mle}:
To prove Theorem~\ref{thm:main}, we apply Lemma~\ref{lem:negahban} with
$\tau_{\mathcal{L}}=0$. Since $\mathcal{L}$ is twice differentiable and convex,
-assumption \eqref{eq:rc} with $\kappa_{\mathcal{L}}=\frac{\gamma}{2}$ is
-implied by the (RE)-condition.. The upper bound
-on the $\ell_{\infty}$ norm of $\nabla\mathcal{L}(\theta^*)$ is given by
-Lemma~\ref{lem:ub}.
+assumption \eqref{eq:rc} with $\kappa_{\mathcal{L}}=\frac{\gamma}{2}$ is implied
+by the (RE)-condition. For a good convergence rate, we must find the smallest
+possible value of $\lambda$ such that $\lambda \geq 2
+\|\nabla\mathcal{L}(\theta^*)\|_{\infty}$. The upper bound on the $\ell_{\infty}$
+norm of $\nabla\mathcal{L}(\theta^*)$ is given by Lemma~\ref{lem:ub}.
-{\color{red} explain usefulness/interpretation and contribution}
-{\color{red} Sketch proof, full proof in appendix}
\begin{lemma}
\label{lem:ub}
Assume {\bf(LF)} holds for some $\alpha>0$. For any $\delta\in(0,1)$:
@@ -137,12 +136,12 @@
Assume {\bf(LF)} holds for some $\alpha>0$. For any $\delta\in(0,1)$:
\end{displaymath}
\end{lemma}
-
-
-Note how the proof of Lemma~\ref{lem:ub} relied crucially on Azuma-Hoeffding's
-inequality, which allows us to handle correlated observations. We now show how
+The proof of Lemma~\ref{lem:ub} relies crucially on Azuma-Hoeffding's
+inequality, which allows us to handle correlated observations. This departs from
+the usual assumptions made in sparse recovery settings, where the measurements
+are assumed to be independent of one another. We now show how
to use Theorem~\ref{thm:main} to recover the support of $\theta^*$, that is, to
-solve the Graph Inference problem.
+solve the Network Inference problem.
\begin{corollary}
\label{cor:variable_selection}
@@ -168,7 +167,7 @@
contradiction. Therefore we get no false positives. If $\theta_i^* > \eta +
\end{proof}
Assuming we know a lower bound $\alpha$ on $\Theta_{i,j}$,
-Corollary~\ref{cor:variable_selection} can be applied to the Graph Inference
+Corollary~\ref{cor:variable_selection} can be applied to the Network Inference
problem in the following manner: pick $\epsilon = \frac{\eta}{2}$ and $\eta =
\frac{\alpha}{3}$, then $S_{\eta+\epsilon}^* = S$ provided that
$n=\Omega\left(\frac{s\log m}{\alpha^3\gamma^2}\right)$. That is, the support
@@ -198,7 +197,6 @@
In other words, the closer $\theta^*$ is to being sparse, the smaller the
price, and we recover the results of Section~\ref{sec:main_theorem} in the
limit of exact sparsity. These results are formalized in the following theorem,
which is also a consequence of Theorem 1 in \cite{Negahban:2009}.
-{\color{red} Include full proof in appendix}
\begin{theorem}
\label{thm:approx_sparse}
@@ -238,9 +236,10 @@
then similarly: ${\cal S}^*_{\eta + \epsilon} \subset \hat {\cal S}_\eta
\subsection{Restricted Eigenvalue Condition}
\label{sec:re}
-In this section, we discuss the main assumption of Theorem~\ref{thm:main}
-namely the restricted eigenvalue condition. We then compare to the
-irrepresentability condition considered in \cite{Daneshmand:2014}.
+In this section, we discuss the main assumption of Theorem~\ref{thm:main}, namely
+the restricted eigenvalue condition introduced by~\cite{bickel2009simultaneous}.
+We then compare to the irrepresentability condition considered in
+\cite{Daneshmand:2014}.
\paragraph{Interpretation}
@@ -248,26 +247,30 @@
There exists a large class of sufficient conditions under which sparse
recovery is achievable in the context of regularized estimation. A good survey
of these different assumptions can be found in \cite{vandegeer:2009}.
-The restricted eigenvalue condition introduce in \cite{bickel:2009} is one of
-the weakest such assumption. It can be interpreted as a restricted form of
-non-degeneracy. Since we apply it to the Hessian of the log-likelihood function
-$\nabla^2 \mathcal{L}(\theta)$, it essentially reduces to a form of restricted
-strong convexity, that Lemma~\ref{lem:negahban} ultimately relies on.
-
-In our case we have:
+The restricted eigenvalue condition is one of the weakest such assumptions. It
+can be interpreted as a restricted form of non-degeneracy.
+Since we apply it to the Hessian of the log-likelihood function
+$\nabla^2 \mathcal{L}(\theta)$, it essentially reduces to a form of restricted
+strong convexity, which Lemma~\ref{lem:negahban} ultimately relies on. Observe
+that the Hessian of $\mathcal{L}$ can be seen as a re-weighted Gram matrix of
+the observations:
\begin{multline*}
  \nabla^2\mathcal{L}(\theta^*) =
  \frac{1}{|\mathcal{T}|}\sum_{t\in\mathcal{T}}x^t(x^t)^T
  \bigg[x_i^{t+1}\frac{f''f-f'^2}{f^2}(\inprod{\theta^*}{x^t})\\
  -(1-x_i^{t+1})\frac{f''(1-f) + f'^2}{(1-f)^2}(\inprod{\theta^*}{x^t})\bigg]
\end{multline*}
-Observe that the Hessian of $\mathcal{L}$ can be seen as
-a re-weighted Gram matrix of the observations. In other words, the restricted
-eigenvalue condition sates that the observed set of active nodes are not
-too collinear with each other.
-
-{\color{red} if the function is strictly log-convex, then equivalent -> explain
-what the gram matrix is (explanation)}
+If $f$ and $1-f$ are $c$-strictly log-convex~\cite{bagnoli2005log} for $c>0$,
+then
+\begin{displaymath}
+  \min\left(\frac{f''f-f'^2}{f^2}(x), \frac{f''(1-f) + f'^2}{(1-f)^2}(x) \right)
+  \geq c
+\end{displaymath}
+and the $(S, \gamma)$-({\bf RE}) condition on the Hessian in
+Theorem~\ref{thm:main} and Theorem~\ref{thm:approx_sparse} reduces to a
+condition on the \emph{Gram matrix} of the observations $X^T X =
+\frac{1}{|\mathcal{T}|}\sum_{t\in\mathcal{T}}x^t(x^t)^T$ with $\gamma \leftarrow
+\gamma\cdot c$. The (RE)-condition is then equivalent to the assumption made in
+the Lasso analysis of~\cite{bickel:2009}.
\paragraph{(RE) with high probability}
@@ -278,17 +281,24 @@
this probabilistic model. Several recent papers show that large classes of
correlated designs obey the restricted eigenvalue property with high
probability \cite{raskutti:10, rudelson:13}.
-In our case, we can show that if (RE)-condition holds for the expected Hessian
-matrix $\E[\nabla^2\mathcal{L}(\theta^*)]$, then it holds for the finite sample
-Hessian matrix $\nabla^2\mathcal{L}(\theta^*)$ with high probability. Note that
-the expected Hessian matrix is exactly the Fisher Information matrix of the
-generalized linear cascade model which captures the amount of information about
-$\theta^*$ conveyed by the random observations. Therefore, under an assumption
-which only involves the probabilistic model and not the actual observations,
-Proposition~\ref{prop:fi} allows us to draw the same conclusion as in
-Theorem~\ref{thm:main}.
+The (RE)-condition is well-behaved in the following sense: if it holds for the
+expected Hessian matrix $\E[\nabla^2\mathcal{L}(\theta^*)]$, then it holds for
+the finite sample Hessian matrix $\nabla^2\mathcal{L}(\theta^*)$ with high
+probability.
-We will need the following additional assumptions on the inverse link function $f$:
+Note that the expected Hessian matrix is exactly the Fisher
+Information matrix of the generalized linear cascade model, which captures the
+amount of information about $\theta^*$ conveyed by the random observations.
+The (RE)-condition has an even more natural interpretation when $f$ and $1-f$
+are strictly log-convex, since the expected \emph{Gram matrix} $A \equiv
+\mathbb{E}[X^T X]$ is a matrix whose entry $a_{i,j}$ is the average number of
+times node $i$ and node $j$ are infected at the same time in the cascade
+process, and $a_{i,i}$ is the average number of times node $i$ is infected.
+
+Therefore, under an assumption which only involves the probabilistic model and
+not the actual observations, Proposition~\ref{prop:fi} allows us to draw the
+same conclusion as in Theorem~\ref{thm:main}. We will need the following
+additional assumptions on the inverse link function $f$:
\begin{equation}
\tag{LF2}
\|f'\|_{\infty} \leq M
@@ -309,73 +319,36 @@
again non restrictive in the (IC) model and (V) model.
condition, w.p $\geq 1-e^{-n^\delta\log m}$.
\end{proposition}
-{\color{red} sketch proof, full (AND BETTER) proof in appendix}
-\begin{proof}Writing $H\defeq \nabla^2\mathcal{L}(\theta^*)$, if
-  $ \forall\Delta\in C(S),\;
-  \|\E[H] - H]\|_\infty\leq \lambda $
-  and $\E[H]$ verifies the $(S,\gamma)$-(RE)
-  condition then:
-  \begin{equation}
-    \label{eq:foo}
-    \forall \Delta\in C(S),\;
-    \Delta H\Delta \geq
-    \Delta \E[H]\Delta(1-32s\lambda/\gamma)
-  \end{equation}
-  Indeed, $
-  |\Delta(H-E[H])\Delta| \leq 2\lambda \|\Delta\|_1^2\leq
-  2\lambda(4\sqrt{s}\|\Delta_s\|_2)^2
-  $.
-  Writing
-  $\partial^2_{i,j}\mathcal{L}(\theta^*)=\frac{1}{|\mathcal{T}|}\sum_{t\in
-  T}Y_t$ and using $(LF)$ and $(LF2)$ we have $\big|Y_t - \E[Y_t]\big|\leq
-  \frac{4(M+2)}{\alpha}$.
-  Applying Azuma's inequality as in the proof of Lemma~\ref{lem:ub}, this
-  implies:
-  \begin{displaymath}
-    \P\big[\|\E[H]-H\|_{\infty}\geq\lambda\big] \leq
-    2\exp\left(-\frac{n\alpha\lambda^2}{4(M+2)} + 2\log m\right)
-  \end{displaymath}
-  Thus, if we take $\lambda=\sqrt{\frac{12(M+2)\log m}{\alpha
-  n^{1-\delta}}}$, $\|E[H]-H\|_{\infty}\leq\lambda$ w.p at least
-  $1-e^{-n^{\delta}\log m}$. When $n^{1-\delta}\geq
-  \frac{M+2}{21\gamma\alpha}s^2\log m$, \eqref{eq:foo} implies
-  $
-  \forall \Delta\in C(S),\;
-  \Delta H\Delta \geq \frac{1}{2} \Delta \E[H]\Delta,
-  $ w.p. at least $1-e^{-n^{\delta}\log m}$ and the conclusion of
-  Proposition~\ref{prop:fi} follows.
-\end{proof}
-
Observe that the number of measurements required in Proposition~\ref{prop:fi}
is now quadratic in $s$. If we only keep the first measurement from each
cascade which are independent, we can apply Theorem 1.8 from
\cite{rudelson:13}, lowering the number of required cascades to $s\log m
\log^3( s\log m)$.
-\paragraph{(RE) vs Irrepresentability Condition}
+%\paragraph{(RE) vs Irrepresentability Condition}
-\cite{Daneshmand:2014} rely on an `incoherence' condition on the hessian of the
-likelihood function also known as the {\it (S,s)-irrepresentability} condition.
+%\cite{Daneshmand:2014} rely on an `incoherence' condition on the hessian of the
+%likelihood function also known as the {\it (S,s)-irrepresentability} condition.
-\begin{comment}
-\begin{definition}
-Following similar notation, let $Q^* \defeq \nabla^2 f(\theta^*)$. Let $S$ and
-$S^c$ be the set of indices of all the parents and non-parents respectively and
-$Q_{S,S}$, $Q_{S^c,S}$, $Q_{S, S^c}$, and $Q_{S^c, S^c}$ the induced
-sub-matrices.
Consider the following constant: +%\begin{equation} +%\nu_{\text{irrepresentable}}(S) \defeq \max_{\tau \in \mathbb{R}^p \ :\ \| \tau +%\|_{\infty} \leq 1} \|Q_{S^c, S} Q_{S, S}^{-1} \tau\|_{\infty} +%\end{equation} +%The (S,s)-irrepresentability holds if $\nu_{\text{irrepresentable}}(S) < 1 - +%\epsilon$ for $\epsilon > 0$ +%\end{definition} +%\end{comment} -It is possible to construct examples where the (RE) condition holds but not the -irrepresentability condition \cite{vandegeer:2009}. Theorem 9.1 from the same -paper shows that a `strong' irrepresentability condition directly {\it implies} -the {\bf(RE)} condition for $\ell_2$-recovery. +%It is possible to construct examples where the (RE) condition holds but not the +%irrepresentability condition \cite{vandegeer:2009}. Theorem 9.1 from the same +%paper shows that a `strong' irrepresentability condition directly {\it implies} +%the {\bf(RE)} condition for $\ell_2$-recovery. \begin{comment} \begin{proposition} @@ -388,53 +361,53 @@ the smallest eigenvalue of $Q^*_{S,S}$, on which the results of \end{proposition} \end{comment} -\begin{comment} -Furthermore, recent papers \cite{vandegeer:2011}, \cite{Zou:2006}, argue that -the irrepresentability condition is unrealistic in situations where there is -a correlation between variables. Consider the following simplified example from -\cite{vandegeer:2011}: -\begin{equation} -\nonumber -\left( -\begin{array}{cccc} -I_s & \rho J \\ -\rho J & I_s \\ -\end{array} -\right) -\end{equation} -where $I_s$ is the $s \times s$ identity matrix, $J$ is the all-ones matrix and -$\rho \in \mathbb{R}^+$. It is easy to see that $\nu_{\text{irrepresentable}}(S) -= \rho s$ and $\lambda_{\min}(Q) \geq 1 - \rho$, such that for any $\rho > -\frac{1}{s}$ and $\rho < 1$, the restricted eigenvalue holds trivially but the -(S,s)-irrepresentability does not hold. +%\begin{comment} +%Furthermore, recent papers \cite{vandegeer:2011}, \cite{Zou:2006}, argue that +%the irrepresentability condition is unrealistic in situations where there is +%a correlation between variables. Consider the following simplified example from +%\cite{vandegeer:2011}: +%\begin{equation} +%\nonumber +%\left( +%\begin{array}{cccc} +%I_s & \rho J \\ +%\rho J & I_s \\ +%\end{array} +%\right) +%\end{equation} +%where $I_s$ is the $s \times s$ identity matrix, $J$ is the all-ones matrix and +%$\rho \in \mathbb{R}^+$. It is easy to see that $\nu_{\text{irrepresentable}}(S) +%= \rho s$ and $\lambda_{\min}(Q) \geq 1 - \rho$, such that for any $\rho > +%\frac{1}{s}$ and $\rho < 1$, the restricted eigenvalue holds trivially but the +%(S,s)-irrepresentability does not hold. -\begin{lemma} -Let ${\cal C}({\cal M}, \bar {\cal M}^\perp, \theta^*) \defeq \{ \Delta \in -\mathbb{R}^p | {\cal R}(\Delta_{\bar {\cal M}^\perp} \leq 3 {\cal -R}(\Delta_{\bar {\cal M}} + 4 {\cal R}(\theta^*_{{\cal M}^\perp}) \}$, where -$\cal R$ is a \emph{decomposable} regularizer with respect to $({\cal M}, \bar -{\cal M}^\perp)$, and $({\cal M}, \bar {\cal M})$ are two subspaces such that -${\cal M} \subseteq \bar {\cal M}$. Suppose that $\exists \kappa_{\cal L} > 0, -\; \exists \tau_{\cal L}, \; \forall \Delta \in {\cal C}, \; {\cal L}(\theta^* + -\Delta) - {\cal L}(\theta^*) - \langle \Delta {\cal L}(\theta^*), \Delta \rangle -\geq \kappa_{\cal L} \|\Delta\|^2 - \tau_{\cal L}^2(\theta^*)$. Let $\Psi({\cal -M}) \defeq \sup_{u \in {\cal M} \backslash \{0\}} \frac{{\cal R}(u)}{\|u\|}$. 
-Finally suppose that $\lambda \geq 2 {\cal R}(\nabla {\cal L}(\theta^*))$, where -${\cal R}^*$ is the conjugate of ${\cal R}$. Then: $$\|\hat \theta_\lambda - -\theta^* \|^2 \leq 9 \frac{\lambda^2}{\kappa_{\cal L}}\Psi^2(\bar {\cal M}) + -\frac{\lambda}{\kappa_{\cal L}}\{2 \tau^2_{\cal L}(\theta^*) + 4 {\cal -R}(\theta^*_{{\cal M}^\perp}\}$$ -\end{lemma} +%\begin{lemma} +%Let ${\cal C}({\cal M}, \bar {\cal M}^\perp, \theta^*) \defeq \{ \Delta \in +%\mathbb{R}^p | {\cal R}(\Delta_{\bar {\cal M}^\perp} \leq 3 {\cal +%R}(\Delta_{\bar {\cal M}} + 4 {\cal R}(\theta^*_{{\cal M}^\perp}) \}$, where +%$\cal R$ is a \emph{decomposable} regularizer with respect to $({\cal M}, \bar +%{\cal M}^\perp)$, and $({\cal M}, \bar {\cal M})$ are two subspaces such that +%${\cal M} \subseteq \bar {\cal M}$. Suppose that $\exists \kappa_{\cal L} > 0, +%\; \exists \tau_{\cal L}, \; \forall \Delta \in {\cal C}, \; {\cal L}(\theta^* + +%\Delta) - {\cal L}(\theta^*) - \langle \Delta {\cal L}(\theta^*), \Delta \rangle +%\geq \kappa_{\cal L} \|\Delta\|^2 - \tau_{\cal L}^2(\theta^*)$. Let $\Psi({\cal +%M}) \defeq \sup_{u \in {\cal M} \backslash \{0\}} \frac{{\cal R}(u)}{\|u\|}$. +%Finally suppose that $\lambda \geq 2 {\cal R}(\nabla {\cal L}(\theta^*))$, where +%${\cal R}^*$ is the conjugate of ${\cal R}$. Then: $$\|\hat \theta_\lambda - +%\theta^* \|^2 \leq 9 \frac{\lambda^2}{\kappa_{\cal L}}\Psi^2(\bar {\cal M}) + +%\frac{\lambda}{\kappa_{\cal L}}\{2 \tau^2_{\cal L}(\theta^*) + 4 {\cal +%R}(\theta^*_{{\cal M}^\perp}\}$$ +%\end{lemma} -\subsection{The Independent Cascade Model} +%\subsection{The Independent Cascade Model} -Recall that to cast the Independent Cascade model as a Generalized Linear -Cascade, we performed the following change of variables: $\forall i,j -\ \Theta_{i,j} = \log(1 - p_{i,j})$. The previous results hold \emph{a priori} -for the ``effective'' parameter $\Theta$. The following lemma shows they also -hold for the original infection probabilities $p_{i,j}$: +%Recall that to cast the Independent Cascade model as a Generalized Linear +%Cascade, we performed the following change of variables: $\forall i,j +%\ \Theta_{i,j} = \log(1 - p_{i,j})$. The previous results hold \emph{a priori} +%for the ``effective'' parameter $\Theta$. The following lemma shows they also +%hold for the original infection probabilities $p_{i,j}$: -The results of sections~\ref{sec:main_theorem} and \ref{sec:relaxing_sparsity} -therefore hold for the original transition probabilities $p$. -\end{comment} +%The results of sections~\ref{sec:main_theorem} and \ref{sec:relaxing_sparsity} +%therefore hold for the original transition probabilities $p$. +%\end{comment} diff --git a/paper/sparse.bib b/paper/sparse.bib index 1dd7589..4aee818 100644 --- a/paper/sparse.bib +++ b/paper/sparse.bib @@ -14,7 +14,8 @@ year = {2006}, @inproceedings{GomezRodriguez:2010, author = {Gomez Rodriguez, Manuel and Leskovec, Jure and Krause, Andreas}, title = {Inferring Networks of Diffusion and Influence}, - booktitle = {Proceedings of the 16th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining}, + booktitle = {Proceedings of the 16th ACM SIGKDD International Conference on + Knowledge Discovery and Data Mining}, series = {KDD '10}, year = {2010}, isbn = {978-1-4503-0055-1}, @@ -141,7 +142,8 @@ fjournal = "Electronic Journal of Statistics", journal = "Electron. J. 
Statist.", pages = "688--749", publisher = "The Institute of Mathematical Statistics and the Bernoulli Society", -title = "The adaptive and the thresholded Lasso for potentially misspecified models (and a lower bound for the Lasso)", +title = "The adaptive and the thresholded Lasso for potentially misspecified + models (and a lower bound for the Lasso)", url = "http://dx.doi.org/10.1214/11-EJS624", volume = "5", year = "2011" @@ -164,8 +166,8 @@ URL = {http://dx.doi.org/10.1198/016214506000000735}, Jason N. Laska and Petros T. Boufounos and Richard G. Baraniuk}, - title = {Robust 1-Bit Compressive Sensing via Binary Stable Embeddings of Sparse - Vectors}, + title = {Robust 1-Bit Compressive Sensing via Binary Stable Embeddings of + Sparse Vectors}, journal = {{IEEE} Transactions on Information Theory}, volume = {59}, number = {4}, @@ -197,7 +199,8 @@ URL = {http://dx.doi.org/10.1198/016214506000000735}, author = {Ankit Gupta and Robert Nowak and Benjamin Recht}, - title = {Sample complexity for 1-bit compressed sensing and sparse classification}, + title = {Sample complexity for 1-bit compressed sensing and sparse + classification}, booktitle = {{IEEE} International Symposium on Information Theory, {ISIT} 2010, June 13-18, 2010, Austin, Texas, USA, Proceedings}, pages = {1553--1557}, @@ -290,8 +293,8 @@ year = "2009" author = {Eric Price and David P. Woodruff}, title = {{(1} + eps)-Approximate Sparse Recovery}, - booktitle = {{IEEE} 52nd Annual Symposium on Foundations of Computer Science, {FOCS} - 2011, Palm Springs, CA, USA, October 22-25, 2011}, + booktitle = {{IEEE} 52nd Annual Symposium on Foundations of Computer Science, + {FOCS} 2011, Palm Springs, CA, USA, October 22-25, 2011}, pages = {295--304}, year = {2011}, crossref = {DBLP:conf/focs/2011}, @@ -318,8 +321,8 @@ year = "2009" @inproceedings{pw12, author = {Eric Price and David P. 
-  title = {Applications of the Shannon-Hartley theorem to data streams and sparse
-  recovery},
+  title = {Applications of the Shannon-Hartley theorem to data streams and
+  sparse recovery},
booktitle = {Proceedings of the 2012 {IEEE} International Symposium on Information Theory, {ISIT} 2012, Cambridge, MA, USA, July 1-6, 2012},
pages = {2446--2450},
@@ -481,9 +484,11 @@
year = "2009"
}
@article{zhang2014,
-  title={Confidence intervals for low dimensional parameters in high dimensional linear models},
+  title={Confidence intervals for low dimensional parameters in high dimensional
+    linear models},
author={Zhang, Cun-Hui and Zhang, Stephanie S},
-  journal={Journal of the Royal Statistical Society: Series B (Statistical Methodology)},
+  journal={Journal of the Royal Statistical Society: Series B (Statistical
+    Methodology)},
volume={76},
number={1},
pages={217--242},
@@ -492,7 +497,8 @@
year = "2009"
}
@article{javanmard2014,
-  title={Confidence intervals and hypothesis testing for high-dimensional regression},
+  title={Confidence intervals and hypothesis testing for high-dimensional
+    regression},
author={Javanmard, Adel and Montanari, Andrea},
journal={The Journal of Machine Learning Research},
volume={15},
@@ -514,7 +520,8 @@
year = "2009"
}
@article{candes2006near,
-  title={Near-optimal signal recovery from random projections: Universal encoding strategies?},
+  title={Near-optimal signal recovery from random projections: Universal
+    encoding strategies?},
author={Candes, Emmanuel J and Tao, Terence},
journal={Information Theory, IEEE Transactions on},
volume={52},
@@ -523,3 +530,23 @@
year = "2009"
year={2006},
publisher={IEEE}
}
+
+@article{bickel2009simultaneous,
+  title={Simultaneous analysis of Lasso and Dantzig selector},
+  author={Bickel, Peter J and Ritov, Ya'acov and Tsybakov, Alexandre B},
+  journal={The Annals of Statistics},
+  pages={1705--1732},
+  year={2009},
+  publisher={JSTOR}
+}
+
+@article{bagnoli2005log,
+  title={Log-concave probability and its applications},
+  author={Bagnoli, Mark and Bergstrom, Ted},
+  journal={Economic theory},
+  volume={26},
+  number={2},
+  pages={445--469},
+  year={2005},
+  publisher={Springer}
+}
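
To make the recovery procedure in the results.tex changes above concrete, here is a minimal Python sketch of the full pipeline for a single node i: draw observations x^t of which nodes are contagious, draw the Bernoulli outcome with probability f(<theta_i, x^t>) for a logistic inverse link (the ``logistic cascades'' case of model.tex), solve an l1-regularized MLE, and threshold at eta = alpha/3 as in the variable-selection corollary. This is an illustration written for this page, not code from the paper or the commit: all constants are arbitrary, the contagious sets are drawn i.i.d. instead of being generated by a true cascade, and scikit-learn's l1-penalized logistic regression stands in for program (eq:pre-mle).

import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
m, s, alpha = 50, 5, 1.0        # nodes, parents of node i, weight lower bound (illustrative)
n = 20000                       # number of observed cascade steps |T| (illustrative)

theta = np.zeros(m)             # true incoming weight vector theta_i of a single node i
parents = rng.choice(m, size=s, replace=False)
theta[parents] = alpha + 0.5 * rng.random(s)

def f(z):                       # logistic inverse link ("logistic cascades")
    return 1.0 / (1.0 + np.exp(-z))

# Row t of X encodes which nodes are contagious at step t; y[t] is the Bernoulli
# realization of f(<theta_i, x^t>), i.e. the generalized linear cascade observation.
X = (rng.random((n, m)) < 0.2).astype(float)
y = (rng.random(n) < f(X @ theta)).astype(int)

# l1-regularized maximum likelihood with lambda on the sqrt(log m / n) scale of
# Lemma lem:ub; the 0.2 prefactor is an arbitrary choice, since the theory only
# fixes the scaling.  sklearn's C is the inverse penalty weight, C = 1 / (n * lambda).
lam = 0.2 * np.sqrt(np.log(m) / n)
mle = LogisticRegression(penalty="l1", C=1.0 / (n * lam), solver="liblinear",
                         fit_intercept=False)
mle.fit(X, y)
theta_hat = mle.coef_.ravel()

# Support recovery by thresholding at eta = alpha / 3, as in the variable-selection corollary.
recovered = np.flatnonzero(theta_hat > alpha / 3)
print("true parents:     ", sorted(parents.tolist()))
print("recovered parents:", sorted(recovered.tolist()))

The choices lambda ~ sqrt(log m / n) and eta = alpha / 3 mirror Lemma lem:ub and Corollary cor:variable_selection up to the absolute constants, which the theory leaves unspecified.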
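
Proposition prop:fi, whose proof this commit moves into appendix.tex, rests on a concentration step: the finite-sample Hessian stays within a small sup-norm distance lambda of its expectation (the Fisher information), so the factor (1 - 32 s lambda / gamma) in eq:foo stays above 1/2. The sketch below, again an illustration rather than the authors' code, only visualizes the sup-norm concentration of the empirical Gram matrix (1/|T|) sum_t x^t (x^t)^T around its expectation for i.i.d. Bernoulli activations with a hypothetical probability p; the paper's observations are correlated, which is why the actual proof goes through Azuma's inequality instead.

import numpy as np

rng = np.random.default_rng(1)
m, p = 50, 0.2                                   # number of nodes, activation probability (illustrative)
expected = np.full((m, m), p * p)                # E[x_i x_j] = p^2 for i != j
np.fill_diagonal(expected, p)                    # E[x_i^2]   = p  on the diagonal

for n in (100, 1000, 10000):
    X = (rng.random((n, m)) < p).astype(float)   # n observed "contagious" indicator vectors
    empirical = X.T @ X / n                      # empirical Gram matrix (1/|T|) sum_t x^t (x^t)^T
    deviation = np.abs(empirical - expected).max()
    print(f"n = {n:6d}   sup-norm deviation = {deviation:.4f}")

The printed deviation shrinks at roughly the sqrt(log m / n) rate at which lambda is chosen in the proof; once 32 s lambda / gamma <= 1/2, the finite-sample matrix inherits the (S, gamma)-(RE) condition with constant gamma / 2.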
