author    Thibaut Horel <thibaut.horel@gmail.com>  2015-12-11 17:53:07 -0500
committer Thibaut Horel <thibaut.horel@gmail.com>  2015-12-11 17:53:07 -0500
commit    8fd65558f9149d4161dbedad6d250dea30c1d239 (patch)
tree      a7d5e179744e2b8c8ea80045addbcf50ff2dab16
parent    41c645980e71a22c37beecfc8eac293752de7755 (diff)
Add "proof" of active learning and fix missing command and citation
-rw-r--r--  finale/final_report.tex       |  1
-rw-r--r--  finale/sections/active.tex    | 59
-rw-r--r--  finale/sections/bayesian.tex  |  1
-rw-r--r--  finale/sparse.bib             |  9
4 files changed, 59 insertions(+), 11 deletions(-)
diff --git a/finale/final_report.tex b/finale/final_report.tex
index 159badc..5f4e441 100644
--- a/finale/final_report.tex
+++ b/finale/final_report.tex
@@ -25,6 +25,7 @@
\newcommand{\bt}{\boldsymbol{\theta}}
\newcommand{\bx}{\mathbf{x}}
\newcommand{\cl}[1]{\text{\textbf{#1}}}
+\newcommand{\var}{\text{Var}}
\newcommand{\eqdef}{\mathbin{\stackrel{\rm def}{=}}}
\newtheorem{theorem}{Theorem}
diff --git a/finale/sections/active.tex b/finale/sections/active.tex
index 43ed75d..7b9b390 100644
--- a/finale/sections/active.tex
+++ b/finale/sections/active.tex
@@ -44,24 +44,61 @@ $\tilde{U}$:
\begin{displaymath}
\tilde{U}(i) = \sum_{j\in V} \var(\Theta_{i,j}),\quad i\in V
\end{displaymath}
-The justification of this surrogate utility function is given by
-\cref{prop:active} which shows that maximizing $\tilde{U}$ implies maximizing
-a lower bound on $U$. Note that when doing Variational Inference with Gaussian
-approximations as described in \cref{sec:inference}, then computing $\tilde{U}$
-simply involves summing one row in the matrix of the current estimates of the
-variances.
+This surrogate utility function is partially motivated by \cref{prop:active},
+which shows that $U(i)$ can be lower-bounded by a simple node-level criterion
+expressed as a sum over the outgoing edges of node $i$.
+
+From a computational perspective, $\tilde{U}$ is also much simpler to evaluate
+than $U$. In particular, when the Bayesian inference method maintains the
+matrix of variances of the approximate posterior distribution, as described in
+\cref{sec:example}, computing $\tilde{U}$ simply amounts to summing this
+matrix along its rows.
\begin{proposition}
- \label{prop:active} For all $i\in V$, $\tilde{U}(i) \leq U(i)$.
+ \label{prop:active}
+ Assume that the prior distribution of $\Theta$ has a product form. Then:
+ \begin{displaymath}
+ U(i) \geq \sum_{j\in V} I\big(\Theta_{i,j},
+ \mathcal{B}(\Theta_{i,j})\big),
+ \quad i\in V
+ \end{displaymath}
+ where $\mathcal{B}(\Theta_{i,j})$ denotes a Bernoulli variable with
+ parameter $\Theta_{i,j}$.
\end{proposition}
-\begin{proof} \end{proof}
+\begin{proof}
+The data processing inequality implies that for any function $f$ and any two
+random variables $X$ and $Y$:
+\begin{displaymath}
+ I(X, Y) \geq I(X, f(Y)).
+\end{displaymath}
+Applying this inequality to $\Theta$ and $\bx$ with $f:\bx\mapsto x^1$,
+which truncates a cascade to its first time step, we get:
+\begin{displaymath}
+ U(i) \geq I(\Theta, x^1\,|\, x^0 = e_i).
+\end{displaymath}
+Using the chain rule for mutual information and the fact that, conditioned on
+$x^0 = e_i$, the coordinates $x^1_j$ of $x^1$ are mutually independent:
+\begin{displaymath}
+ U(i) \geq \sum_{j\in V} I(\Theta, x^1_j\,|\, x^0 = e_i).
+\end{displaymath}
+Since $\Theta$ has a product form and since $x^1_j$ only depends on
+$\Theta_{i,j}$ (conditioned on $x^0 = e_i$) we can further write:
+\begin{displaymath}
+ U(i) \geq \sum_{j\in V} I(\Theta_{i,j}, x^1_j\,|\,x^0 = e_i).
+\end{displaymath}
+This completes the proof since, conditioned on $x^0 = e_i$, $x^1_j$ is
+a Bernoulli variable with parameter $\Theta_{i,j}$.
+\end{proof}
\begin{remark} In the first part of the proof, we show that truncating
  a cascade to its first time step can only decrease the
- mutual information. It has been noted in a different context \cite{} that
- for the purpose of graph inference, the first step of a cascade is the most
- informative.
+ mutual information. It has been noted in \cite{Abrahao:13}, although for
+ a slightly different cascade model, that for the purpose of graph inference,
+ the head of a cascade is the most informative part. This further motivates
+ our proposed approximation of $U$.
\end{remark}
\paragraph{Online Bayesian Updates} TODO: SGD on data streams. Cite
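
Both quantities above are cheap to evaluate once an approximate posterior is
available. The following is a minimal numpy sketch, assuming the posterior is
summarized by a variance matrix and by samples of one row of Theta; the
function names and the Monte Carlo estimator are illustrative, not code from
this repository.

    import numpy as np

    def surrogate_utility(var_matrix):
        # U~(i) = sum_j Var(Theta_{i,j}): sum the posterior variance matrix
        # along its rows, as described after the definition of U~.
        return var_matrix.sum(axis=1)

    def binary_entropy(p):
        p = np.clip(p, 1e-12, 1.0 - 1e-12)
        return -(p * np.log(p) + (1.0 - p) * np.log(1.0 - p))

    def mi_lower_bound(theta_samples):
        # Monte Carlo estimate of the bound in prop:active,
        # sum_j I(Theta_{i,j}, B(Theta_{i,j})), using
        # I(Theta, X) = H(X) - E_Theta[H(X | Theta)] where
        # X | Theta ~ Bernoulli(Theta), so the marginal of X is
        # Bernoulli(E[Theta]).
        # theta_samples: shape (S, n), S posterior samples of row i of Theta.
        marginal = theta_samples.mean(axis=0)              # E[Theta_{i,j}]
        conditional = binary_entropy(theta_samples).mean(axis=0)
        return float((binary_entropy(marginal) - conditional).sum())

    # Greedy seed selection: intervene at the node maximizing the surrogate.
    rng = np.random.default_rng(0)
    V = rng.uniform(0.0, 0.05, size=(100, 100))  # stand-in variance matrix
    i_star = int(np.argmax(surrogate_utility(V)))
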
diff --git a/finale/sections/bayesian.tex b/finale/sections/bayesian.tex
index 6e44b08..dda5c81 100644
--- a/finale/sections/bayesian.tex
+++ b/finale/sections/bayesian.tex
@@ -91,6 +91,7 @@ linear/quadratic approximation to the log-likelihood for which the expectation
term can be written analytically.
\subsection{Example}
+\label{sec:example}
We develop the Variational Inference algorithm for the IC model (cf.
Section~\ref{sec:model}). As mentioned above, the IC model with link function
$f: z \mapsto 1 - e^{-z}$ has no conjugate priors. We consider here a truncated
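
A companion sketch of the first cascade step under this link function, the
observation model used in the proof of prop:active; the latent weight matrix
Z and its initialization are assumptions for illustration, with the edge
activation probability taken to be f applied to the latent weight.

    import numpy as np

    def ic_link(z):
        # Link function f: z -> 1 - exp(-z) of the IC model.
        return 1.0 - np.exp(-z)

    def first_step(Z, i, rng):
        # Sample x^1 given x^0 = e_i: each node j activates independently
        # with probability f(Z[i, j]), the conditional independence
        # exploited in the proof of the active-learning lower bound.
        return (rng.random(Z.shape[0]) < ic_link(Z[i])).astype(int)

    rng = np.random.default_rng(1)
    Z = rng.exponential(scale=0.2, size=(50, 50))  # hypothetical edge weights
    x1 = first_step(Z, i=0, rng=rng)
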
diff --git a/finale/sparse.bib b/finale/sparse.bib
index d2487c1..be276cb 100644
--- a/finale/sparse.bib
+++ b/finale/sparse.bib
@@ -1,3 +1,12 @@
+@article{chaloner,
+  title={Bayesian experimental design: A review},
+  author={Chaloner, Kathryn and Verdinelli, Isabella},
+  journal={Statistical Science},
+  volume={10},
+  number={3},
+  pages={273--304},
+  year={1995},
+  publisher={Institute of Mathematical Statistics}
+}
+
@article {CandesRomberTao:2006,
author = {Candès, Emmanuel J. and Romberg, Justin K. and Tao, Terence},
title = {Stable signal recovery from incomplete and inaccurate measurements},