Diffstat (limited to 'finale/sections')

 -rw-r--r--  finale/sections/active.tex   | 59
 -rw-r--r--  finale/sections/bayesian.tex |  1

 2 files changed, 49 insertions, 11 deletions
diff --git a/finale/sections/active.tex b/finale/sections/active.tex
index 43ed75d..7b9b390 100644
--- a/finale/sections/active.tex
+++ b/finale/sections/active.tex
@@ -44,24 +44,61 @@ $\tilde{U}$:
 \begin{displaymath}
   \tilde{U}(i) = \sum_{j\in V} \var(\Theta_{i,j}),\quad i\in V
 \end{displaymath}
-The justification of this surrogate utility function is given by
-\cref{prop:active} which shows that maximizing $\tilde{U}$ implies maximizing
-a lower bound on $U$. Note that when doing Variational Inference with Gaussian
-approximations as described in \cref{sec:inference}, then computing $\tilde{U}$
-simply involves summing one row in the matrix of the current estimates of the
-variances.
+This surrogate utility function is partially motivated by \cref{prop:active},
+which shows that $U(i)$ can be lower-bounded by a simple node-level criterion
+expressed as a sum over the outgoing edges of node $i$.
+
+From a computational perspective, $\tilde{U}$ is much simpler to compute than
+$U$. In particular, when the Bayesian Inference method in use maintains the
+matrix of variances of the approximate posterior distribution, as described
+in \cref{sec:example}, computing $\tilde{U}$ simply amounts to summing this
+matrix along its rows.
 
 \begin{proposition}
-  \label{prop:active} For all $i\in V$, $\tilde{U}(i) \leq U(i)$.
+  \label{prop:active}
+  Assume that the prior distribution of $\Theta$ has a product form. Then:
+  \begin{displaymath}
+    U(i) \geq \sum_{j\in V} I\big(\Theta_{i,j},
+    \mathcal{B}(\Theta_{i,j})\big),
+    \quad i\in V
+  \end{displaymath}
+  where $\mathcal{B}(\Theta_{i,j})$ denotes a Bernoulli variable with
+  parameter $\Theta_{i,j}$.
 \end{proposition}
-\begin{proof} \end{proof}
+\begin{proof}
+The data processing inequality implies that for any function $f$ and any two
+random variables $X$ and $Y$:
+\begin{displaymath}
+  I(X, Y) \geq I(X, f(Y)).
+\end{displaymath}
+Applying this inequality to $\Theta$ and $\bx$ with $f:\bx\mapsto x^1$,
+which truncates a cascade to its first time step, we get:
+\begin{displaymath}
+  U(i) \geq I(\Theta, x^1\,|\, x^0 = e_i).
+\end{displaymath}
+Using the chain rule for mutual information and the fact that the coordinates
+of $x^1$ are mutually independent:
+\begin{displaymath}
+  U(i) \geq \sum_{j\in V} I(\Theta, x^1_j\,|\, x^0 = e_i).
+\end{displaymath}
+Since $\Theta$ has a product form and since $x^1_j$ only depends on
+$\Theta_{i,j}$ (conditioned on $x^0 = e_i$), we can further write:
+\begin{displaymath}
+  \begin{split}
+    U(i)&\geq \sum_{j\in V} I(\Theta_{i,j}, x^1_j\,|\,x^0 = e_i).
+  \end{split}
+\end{displaymath}
+This completes the proof since, conditioned on $x^0 = e_i$, $x^1_j$ is
+a Bernoulli variable with parameter $\Theta_{i,j}$.
+\end{proof}
 
 \begin{remark}
   In the first part of the proof, we show that truncating a cascade to
   keep only the first time step is a lower bound on the complete
-  mutual information. It has been noted in a different context \cite{} that
-  for the purpose of graph inference, the first step of a cascade is the most
-  informative.
+  mutual information. It has been noted in \cite{Abrahao:13}, albeit for
+  a slightly different cascade model, that for the purpose of graph inference
+  the head of a cascade is the most informative. This further motivates our
+  proposed approximation of $U$.
 \end{remark}
 \paragraph{Online Bayesian Updates} bullshit on SGD on data streams.
 Cite
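The selection step described in this hunk reduces to a row sum: with the matrix of posterior variances of $\Theta$ in hand, $\tilde{U}(i)$ is the sum of row $i$, and the next source node is its argmax. A minimal NumPy sketch of that step, assuming the inference routine exposes such a matrix; the names select_source and var_theta are illustrative, not taken from the paper.

import numpy as np

def select_source(var_theta):
    # var_theta: (n, n) array of current posterior variance estimates of
    # Theta, e.g. as maintained by a Gaussian variational approximation.
    # U~(i) = sum_j Var(Theta_{i,j}) is one row sum per node.
    u_tilde = var_theta.sum(axis=1)
    return int(np.argmax(u_tilde))

# toy usage with 4 nodes and random variance estimates
rng = np.random.default_rng(0)
var_theta = rng.uniform(0.0, 0.25, size=(4, 4))
print(select_source(var_theta))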
diff --git a/finale/sections/bayesian.tex b/finale/sections/bayesian.tex
index 6e44b08..dda5c81 100644
--- a/finale/sections/bayesian.tex
+++ b/finale/sections/bayesian.tex
@@ -91,6 +91,7 @@ linear/quadratic approximation to the log-likelihood for which the expectation
 term can be written analytically.
 
 \subsection{Example}
+\label{sec:example}
 We develop the Variational Inference algorithm for the IC model (cf.
 Section~\ref{sec:model}). As mentioned above, the IC model with link function
 $f: z \mapsto 1 - e^{-z}$ has no conjugate priors. We consider here a truncated
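The second hunk labels the IC example with link function $f: z \mapsto 1 - e^{-z}$, while the proof in active.tex rests on the observation that, conditioned on $x^0 = e_i$, each $x^1_j$ is Bernoulli with parameter $\Theta_{i,j}$. The sketch below combines the two under the assumption that $\Theta_{i,j} = 1 - e^{-z_{i,j}}$ for nonnegative weights $z_{i,j}$; ic_probs and first_cascade_step are hypothetical names, and no particular convention for the seed node is taken from the paper.

import numpy as np

def ic_probs(z_row):
    # Link function of the IC model: Theta_{i,j} = 1 - exp(-z_{i,j}).
    return 1.0 - np.exp(-z_row)

def first_cascade_step(z, i, rng):
    # Sample x^1 given x^0 = e_i: each node j activates independently with
    # probability Theta_{i,j}, which is the Bernoulli view used in the proof.
    # (How the seed node i itself is treated depends on the model's
    # conventions, which this sketch does not try to pin down.)
    theta_i = ic_probs(z[i])
    return (rng.random(z.shape[0]) < theta_i).astype(int)

rng = np.random.default_rng(0)
z = rng.exponential(scale=0.5, size=(4, 4))  # toy nonnegative edge weights
print(first_cascade_step(z, i=0, rng=rng))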
