summaryrefslogtreecommitdiffstats
path: root/theory/main.tex
diff options
context:
space:
mode:
authorThibaut Horel <thibaut.horel@gmail.com>2015-07-03 07:57:31 -0700
committerThibaut Horel <thibaut.horel@gmail.com>2015-07-03 07:57:31 -0700
commitb91ab29e267fb58b8ba161c1f77da5919b0e62d9 (patch)
tree619637077449ee6115103f9a605c33ded3256873 /theory/main.tex
parent58ed50d980a1ba240bb50b27c42db0a679f00b43 (diff)
downloadcriminal_cascades-b91ab29e267fb58b8ba161c1f77da5919b0e62d9.tar.gz
Add thresholding algorithm and beginning of beta optimization
Diffstat (limited to 'theory/main.tex')
-rw-r--r--theory/main.tex91
1 files changed, 82 insertions, 9 deletions
diff --git a/theory/main.tex b/theory/main.tex
index 1b4f92a..eb59c6e 100644
--- a/theory/main.tex
+++ b/theory/main.tex
@@ -259,7 +259,7 @@ and a random forest $\mathcal{T} = (T_1,\ldots, T_r)$, we can compute the
probability of observing the infection times given by the vector $\textbf{t}$:
\begin{equation}
\label{eq:model}
- \P[\mathbf{t}\,|\, \mathcal{T}, \beta, \alpha, \lambda]
+ \P[\mathbf{t}, \mathcal{T}\,|\, \beta, \alpha, \lambda]
= \beta^r(1-\beta)^{n-r}\prod_{e\in E_{\mathcal{T}}} p_e
\prod_{e\in E_{D_{\mathcal{\mathbf{t}}}}\setminus E_{\mathcal{T}}}
\tilde{p}_e
@@ -270,16 +270,24 @@ in Section~\ref{sec:dag}.
\section{Maximum Likelihood Inference}
-In this section, we assume that we observe a vector of infection times
-$\mathbf{t}$ and we are interested in recovering the parameters $\mathcal{T},
-\beta, \alpha, \lambda$ maximizing the likelihood of our observations as given
-by the model \eqref{eq:model}. That is, we want to solve the following
-optimization problem:
+The probabilistic model \eqref{eq:model} defines the joint probability of
+a pair $\mathbf{t}, \mathcal{T}$ of infection times and cascades. However, we
+are interested in situations where only the infection times are observed.
+
+A direct application of maximum likelihood estimation would recover the
+parameters $\beta, \alpha$ and $\lambda$ by solving the following optimization
+program:
+\begin{displaymath}
+ \max_{\alpha,\beta,\lambda}\sum_{\mathcal{T}}
+ \P[\mathbf{t}, \mathcal{T}\,|\, \beta, \alpha, \lambda]
+\end{displaymath}
+However, since we are also interested in recovering the infection edges, we
+solve the following optimization program instead:
\begin{equation}
\label{eq:program}
\mathcal{T}^*, \beta^*, \alpha^*, \lambda^* \in
\argmax_{\mathcal{T},\beta,\alpha,\lambda}
- \P[\mathbf{t}\,|\, \mathcal{T}, \beta, \alpha, \lambda]
+ \P[\mathbf{t}, \mathcal{T}\,|\, \beta, \alpha, \lambda]
\end{equation}
When solving this optimization program, the optimal forest $\mathcal{T}^*$ will
indicate the most likely edges along which the infection could have propagated,
@@ -290,12 +298,12 @@ temporal and structural scale of pairwise infection.
We first assume that $\beta$, $\alpha$ and $\lambda$ are fixed and solve
\eqref{eq:program} for $\mathcal{T}$:
-\begin{displaymath}
+\begin{equation}\label{eq:program2}
\mathcal{T}^*\in\argmax_{\mathcal{T}}
\beta^r(1-\beta)^{n-r}\prod_{e\in E_{\mathcal{T}}} p_e
\prod_{e\in E_{D_{\mathcal{\mathbf{t}}}}\setminus E_{\mathcal{T}}}
\tilde{p}_e
-\end{displaymath}
+\end{equation}
Note that the solution $\mathcal{T}^*$ depends on the value of $(\beta, \alpha,
\lambda)$. We will then optimize over these parameters in the following
sections.
@@ -317,8 +325,73 @@ equivalent optimization program:
+\sum_{e\in E_{\mathcal{T}}} \log\frac{p_e}{\tilde{p}_e}
\end{displaymath}
+As mentioned in Section XXX, $\mathcal{T}$ is a random forest spawning $R$, the
+set of victim nodes, in $D_{\mathbf{t}}$. So choosing $\mathcal{T}$ amounts to
+choosing for each node in $R$:
+\begin{itemize}
+ \item either to make it the root of a new tree, in which case it will
+ contribute a term $\log\frac{\beta}{1-\beta}$ to the objective
+ function.
+ \item either to make it part of an existing tree, by including one of its
+ incoming edge $e$ in the edge set $E_{\mathcal{T}}$. In this case, the
+ node contributes a term $\log\frac{p_e}{\tilde{p}_e}$ to the objective
+ function.
+\end{itemize}
+
+Since our goal is to solve \eqref{eq:program2}, in the second case, it is
+sufficient to consider the incoming edge which maximizes the quantity
+$\frac{p_e}{\tilde{p}_e}$. For this edge $e$, deciding between the two cases
+then amounts to comparing $\frac{p_e}{\tilde{p}_e}$ to $\frac{\beta}{1-\beta}$.
+This leads to a simple thresholding algorithm for solving \eqref{eq:program2}.
+
+\begin{algorithm}
+ \caption{Thresholding algorithm for cascade recovery}
+ \begin{algorithmic}[1]
+ \Require graph $G$, infection times $\mathbf{t}$, $\alpha$, $\beta$, $\lambda$
+ \Ensure cascades $\mathcal{T}$
+ \State $D_{\mathbf{t}}\gets$ acyclic graph obtained by trimming $G$
+ \State $R\gets\{u\in G: t_u < \infty\}$
+ \State $E\gets\emptyset$
+ \For{$v\in R$}
+ \State $u\gets\argmax\left\{\frac{p_{u,v}}{\tilde{p}_{u,v}}: (u,v)\in
+ D_t\right\}$
+ \If{$\frac{p_{u,v}}{\tilde{p}_{u,v}}\geq \frac{\beta}{1-\beta}$}
+ \State $E\gets E\cup \{(u,v)\}$
+ \EndIf
+ \EndFor
+ \State \textbf{return} $\mathcal{T} = (R, E)$
+ \end{algorithmic}
+\end{algorithm}
+
\subsection{Finding the Spawning Parameter}
+In this section, we consider the value of $\alpha$ and $\lambda$ fixed and try
+to optimize \eqref{eq:program} jointly over $\mathcal{T}$ and $\beta$.
+
+For a node $v\in R$, let us define:
+\begin{displaymath}
+ q_v \eqdef \max\left\{\frac{p_{u,v}}{\tilde{p}_{u,v}}:(u,v)\in E_{\mathbf{t}}\right\}
+\end{displaymath}
+and denote by $E_{\beta}$ the set:
+\begin{displaymath}
+ R_{\beta} \eqdef \left\{v\in R:q_v\geq \frac{\beta}{1-\beta}\right\}
+\end{displaymath}
+Then it follows from the analysis of Section XXX, that for a fixed value of
+$\beta$, the optimal value of \eqref{eq:program} when optimizing over
+$\mathcal{T}$ is given by:
+\begin{displaymath}
+ f(\beta) = (|R|-|R_\beta|)\log\frac{\beta}{1-\beta}
+ + n\log(1-\beta)+ \sum_{v\in R_\beta} \log q_v
+\end{displaymath}
+Hence our goal is to find the maximum of $f(\beta)$ over the interval $(0,1)$.
+Note that the maximum exists since $f$ diverges to $-\infty$ as $\beta$
+approaches $0$ or $1$. Furthermore $f$ is continuous over $(0,1)$ since we can
+rewrite it:
+\begin{displaymath}
+ f(\beta) = n\log(1-\beta) + \sum_{v\in R}
+ \log\max\left\{\frac{\beta}{1-\beta}, q_v\right\}
+\end{displaymath}
+
\subsection{Finding the Pairwise Propagation Parameters}
\end{document}