Add thresholding algorithm and beginning of beta optimization

author: Thibaut Horel <thibaut.horel@gmail.com> 2015-07-03 07:57:31 -0700
committer: Thibaut Horel <thibaut.horel@gmail.com> 2015-07-03 07:57:31 -0700
commit: b91ab29e267fb58b8ba161c1f77da5919b0e62d9 (patch)
tree: 619637077449ee6115103f9a605c33ded3256873 /theory/main.tex
parent: 58ed50d980a1ba240bb50b27c42db0a679f00b43 (diff)
download: criminal_cascades-b91ab29e267fb58b8ba161c1f77da5919b0e62d9.tar.gz
1 files changed, 82 insertions, 9 deletions
diff --git a/theory/main.tex b/theory/main.tex
index 1b4f92a..eb59c6e 100644
--- a/theory/main.tex
+++ b/theory/main.tex
@@ -259,7 +259,7 @@ and a random forest $\mathcal{T} = (T_1,\ldots, T_r)$, we can compute the
 probability of observing the infection times given by the vector $\textbf{t}$:
 \begin{equation}
     \label{eq:model}
-    \P[\mathbf{t}\,|\, \mathcal{T}, \beta, \alpha, \lambda]
+    \P[\mathbf{t}, \mathcal{T}\,|\, \beta, \alpha, \lambda]
     = \beta^r(1-\beta)^{n-r}\prod_{e\in E_{\mathcal{T}}} p_e
     \prod_{e\in E_{D_{\mathcal{\mathbf{t}}}}\setminus E_{\mathcal{T}}}
     \tilde{p}_e
@@ -270,16 +270,24 @@ in Section~\ref{sec:dag}.
 
 \section{Maximum Likelihood Inference}
 
-In this section, we assume that we observe a vector of infection times
-$\mathbf{t}$ and we are interested in recovering the parameters $\mathcal{T},
-\beta, \alpha, \lambda$ maximizing the likelihood of our observations as given
-by the model \eqref{eq:model}. That is, we want to solve the following
-optimization problem:
+The probabilistic model \eqref{eq:model} defines the joint probability of
+a pair $\mathbf{t}, \mathcal{T}$ of infection times and cascades. However, we
+are interested in situations where only the infection times are observed.
+
+A direct application of maximum likelihood estimation would recover the
+parameters $\beta, \alpha$ and $\lambda$ by solving the following optimization
+program:
+\begin{displaymath}
+    \max_{\alpha,\beta,\lambda}\sum_{\mathcal{T}}
+    \P[\mathbf{t}, \mathcal{T}\,|\, \beta, \alpha, \lambda]
+\end{displaymath}
+However, since we are also interested in recovering the infection edges, we
+solve the following optimization program instead:
 \begin{equation}
     \label{eq:program}
     \mathcal{T}^*, \beta^*, \alpha^*, \lambda^* \in
     \argmax_{\mathcal{T},\beta,\alpha,\lambda}
-    \P[\mathbf{t}\,|\, \mathcal{T}, \beta, \alpha, \lambda]
+    \P[\mathbf{t}, \mathcal{T}\,|\, \beta, \alpha, \lambda]
 \end{equation}
 When solving this optimization program, the optimal forest $\mathcal{T}^*$ will
 indicate the most likely edges along which the infection could have propagated,
@@ -290,12 +298,12 @@ temporal and structural scale of pairwise infection.
 
 We first assume that $\beta$, $\alpha$ and $\lambda$ are fixed and solve
 \eqref{eq:program} for $\mathcal{T}$:
-\begin{displaymath}
+\begin{equation}\label{eq:program2}
     \mathcal{T}^*\in\argmax_{\mathcal{T}}
     \beta^r(1-\beta)^{n-r}\prod_{e\in E_{\mathcal{T}}} p_e
     \prod_{e\in E_{D_{\mathcal{\mathbf{t}}}}\setminus E_{\mathcal{T}}}
     \tilde{p}_e
-\end{displaymath}
+\end{equation}
 Note that the solution $\mathcal{T}^*$ depends on the value of $(\beta, \alpha,
 \lambda)$. We will then optimize over these parameters in the following
 sections.
@@ -317,8 +325,73 @@ equivalent optimization program:
     +\sum_{e\in E_{\mathcal{T}}} \log\frac{p_e}{\tilde{p}_e}
 \end{displaymath}
 
+As mentioned in Section XXX, $\mathcal{T}$ is a random forest spawning $R$, the
+set of victim nodes, in $D_{\mathbf{t}}$. So choosing $\mathcal{T}$ amounts to
+choosing for each node in $R$:
+\begin{itemize}
+    \item either to make it the root of a new tree, in which case it will
+        contribute a term $\log\frac{\beta}{1-\beta}$ to the objective
+        function.
+    \item either to make it part of an existing tree, by including one of its
+        incoming edge $e$ in the edge set $E_{\mathcal{T}}$. In this case, the
+        node contributes a term $\log\frac{p_e}{\tilde{p}_e}$ to the objective
+        function.
+\end{itemize}
+
+Since our goal is to solve \eqref{eq:program2}, in the second case, it is
+sufficient to consider the incoming edge which maximizes the quantity
+$\frac{p_e}{\tilde{p}_e}$. For this edge $e$, deciding between the two cases
+then amounts to comparing $\frac{p_e}{\tilde{p}_e}$ to $\frac{\beta}{1-\beta}$.
+This leads to a simple thresholding algorithm for solving \eqref{eq:program2}. 
+
+\begin{algorithm}
+    \caption{Thresholding algorithm for cascade recovery}
+    \begin{algorithmic}[1]
+    \Require graph $G$, infection times $\mathbf{t}$, $\alpha$, $\beta$, $\lambda$
+    \Ensure cascades $\mathcal{T}$
+    \State $D_{\mathbf{t}}\gets$ acyclic graph obtained by trimming $G$
+    \State $R\gets\{u\in G: t_u < \infty\}$
+    \State $E\gets\emptyset$
+    \For{$v\in R$}
+    \State $u\gets\argmax\left\{\frac{p_{u,v}}{\tilde{p}_{u,v}}: (u,v)\in
+    D_t\right\}$
+    \If{$\frac{p_{u,v}}{\tilde{p}_{u,v}}\geq \frac{\beta}{1-\beta}$}
+    \State $E\gets E\cup \{(u,v)\}$
+    \EndIf
+    \EndFor
+    \State \textbf{return} $\mathcal{T} = (R, E)$
+    \end{algorithmic}
+\end{algorithm}
+
 \subsection{Finding the Spawning Parameter}
 
+In this section, we consider the value of $\alpha$ and $\lambda$ fixed and try
+to optimize \eqref{eq:program} jointly over $\mathcal{T}$ and $\beta$.
+
+For a node $v\in R$, let us define:
+\begin{displaymath}
+    q_v \eqdef \max\left\{\frac{p_{u,v}}{\tilde{p}_{u,v}}:(u,v)\in E_{\mathbf{t}}\right\}
+\end{displaymath}
+and denote by $E_{\beta}$ the set:
+\begin{displaymath}
+    R_{\beta} \eqdef \left\{v\in R:q_v\geq \frac{\beta}{1-\beta}\right\}
+\end{displaymath}
+Then it follows from the analysis of Section XXX, that for a fixed value of
+$\beta$, the optimal value of \eqref{eq:program} when optimizing over
+$\mathcal{T}$ is given by:
+\begin{displaymath}
+    f(\beta) = (|R|-|R_\beta|)\log\frac{\beta}{1-\beta}
+    + n\log(1-\beta)+ \sum_{v\in R_\beta} \log q_v
+\end{displaymath}
+Hence our goal is to find the maximum of $f(\beta)$ over the interval $(0,1)$.
+Note that the maximum exists since $f$ diverges to $-\infty$ as $\beta$
+approaches $0$ or $1$. Furthermore $f$ is continuous over $(0,1)$ since we can
+rewrite it:
+\begin{displaymath}
+    f(\beta) = n\log(1-\beta) + \sum_{v\in R}
+    \log\max\left\{\frac{\beta}{1-\beta}, q_v\right\}
+\end{displaymath}
+
 \subsection{Finding the Pairwise Propagation Parameters}
 
 \end{document}
author	Thibaut Horel <thibaut.horel@gmail.com>	2015-07-03 07:57:31 -0700
committer	Thibaut Horel <thibaut.horel@gmail.com>	2015-07-03 07:57:31 -0700
commit	b91ab29e267fb58b8ba161c1f77da5919b0e62d9 (patch)
tree	619637077449ee6115103f9a605c33ded3256873 /theory/main.tex
parent	58ed50d980a1ba240bb50b27c42db0a679f00b43 (diff)
download	criminal_cascades-b91ab29e267fb58b8ba161c1f77da5919b0e62d9.tar.gz