diff options
Diffstat (limited to 'theory/main.tex')
| -rw-r--r-- | theory/main.tex | 91 |
1 files changed, 82 insertions, 9 deletions
diff --git a/theory/main.tex b/theory/main.tex index 1b4f92a..eb59c6e 100644 --- a/theory/main.tex +++ b/theory/main.tex @@ -259,7 +259,7 @@ and a random forest $\mathcal{T} = (T_1,\ldots, T_r)$, we can compute the probability of observing the infection times given by the vector $\textbf{t}$: \begin{equation} \label{eq:model} - \P[\mathbf{t}\,|\, \mathcal{T}, \beta, \alpha, \lambda] + \P[\mathbf{t}, \mathcal{T}\,|\, \beta, \alpha, \lambda] = \beta^r(1-\beta)^{n-r}\prod_{e\in E_{\mathcal{T}}} p_e \prod_{e\in E_{D_{\mathcal{\mathbf{t}}}}\setminus E_{\mathcal{T}}} \tilde{p}_e @@ -270,16 +270,24 @@ in Section~\ref{sec:dag}. \section{Maximum Likelihood Inference} -In this section, we assume that we observe a vector of infection times -$\mathbf{t}$ and we are interested in recovering the parameters $\mathcal{T}, -\beta, \alpha, \lambda$ maximizing the likelihood of our observations as given -by the model \eqref{eq:model}. That is, we want to solve the following -optimization problem: +The probabilistic model \eqref{eq:model} defines the joint probability of +a pair $\mathbf{t}, \mathcal{T}$ of infection times and cascades. However, we +are interested in situations where only the infection times are observed. + +A direct application of maximum likelihood estimation would recover the +parameters $\beta, \alpha$ and $\lambda$ by solving the following optimization +program: +\begin{displaymath} + \max_{\alpha,\beta,\lambda}\sum_{\mathcal{T}} + \P[\mathbf{t}, \mathcal{T}\,|\, \beta, \alpha, \lambda] +\end{displaymath} +However, since we are also interested in recovering the infection edges, we +solve the following optimization program instead: \begin{equation} \label{eq:program} \mathcal{T}^*, \beta^*, \alpha^*, \lambda^* \in \argmax_{\mathcal{T},\beta,\alpha,\lambda} - \P[\mathbf{t}\,|\, \mathcal{T}, \beta, \alpha, \lambda] + \P[\mathbf{t}, \mathcal{T}\,|\, \beta, \alpha, \lambda] \end{equation} When solving this optimization program, the optimal forest $\mathcal{T}^*$ will indicate the most likely edges along which the infection could have propagated, @@ -290,12 +298,12 @@ temporal and structural scale of pairwise infection. We first assume that $\beta$, $\alpha$ and $\lambda$ are fixed and solve \eqref{eq:program} for $\mathcal{T}$: -\begin{displaymath} +\begin{equation}\label{eq:program2} \mathcal{T}^*\in\argmax_{\mathcal{T}} \beta^r(1-\beta)^{n-r}\prod_{e\in E_{\mathcal{T}}} p_e \prod_{e\in E_{D_{\mathcal{\mathbf{t}}}}\setminus E_{\mathcal{T}}} \tilde{p}_e -\end{displaymath} +\end{equation} Note that the solution $\mathcal{T}^*$ depends on the value of $(\beta, \alpha, \lambda)$. We will then optimize over these parameters in the following sections. @@ -317,8 +325,73 @@ equivalent optimization program: +\sum_{e\in E_{\mathcal{T}}} \log\frac{p_e}{\tilde{p}_e} \end{displaymath} +As mentioned in Section XXX, $\mathcal{T}$ is a random forest spawning $R$, the +set of victim nodes, in $D_{\mathbf{t}}$. So choosing $\mathcal{T}$ amounts to +choosing for each node in $R$: +\begin{itemize} + \item either to make it the root of a new tree, in which case it will + contribute a term $\log\frac{\beta}{1-\beta}$ to the objective + function. + \item either to make it part of an existing tree, by including one of its + incoming edge $e$ in the edge set $E_{\mathcal{T}}$. In this case, the + node contributes a term $\log\frac{p_e}{\tilde{p}_e}$ to the objective + function. +\end{itemize} + +Since our goal is to solve \eqref{eq:program2}, in the second case, it is +sufficient to consider the incoming edge which maximizes the quantity +$\frac{p_e}{\tilde{p}_e}$. For this edge $e$, deciding between the two cases +then amounts to comparing $\frac{p_e}{\tilde{p}_e}$ to $\frac{\beta}{1-\beta}$. +This leads to a simple thresholding algorithm for solving \eqref{eq:program2}. + +\begin{algorithm} + \caption{Thresholding algorithm for cascade recovery} + \begin{algorithmic}[1] + \Require graph $G$, infection times $\mathbf{t}$, $\alpha$, $\beta$, $\lambda$ + \Ensure cascades $\mathcal{T}$ + \State $D_{\mathbf{t}}\gets$ acyclic graph obtained by trimming $G$ + \State $R\gets\{u\in G: t_u < \infty\}$ + \State $E\gets\emptyset$ + \For{$v\in R$} + \State $u\gets\argmax\left\{\frac{p_{u,v}}{\tilde{p}_{u,v}}: (u,v)\in + D_t\right\}$ + \If{$\frac{p_{u,v}}{\tilde{p}_{u,v}}\geq \frac{\beta}{1-\beta}$} + \State $E\gets E\cup \{(u,v)\}$ + \EndIf + \EndFor + \State \textbf{return} $\mathcal{T} = (R, E)$ + \end{algorithmic} +\end{algorithm} + \subsection{Finding the Spawning Parameter} +In this section, we consider the value of $\alpha$ and $\lambda$ fixed and try +to optimize \eqref{eq:program} jointly over $\mathcal{T}$ and $\beta$. + +For a node $v\in R$, let us define: +\begin{displaymath} + q_v \eqdef \max\left\{\frac{p_{u,v}}{\tilde{p}_{u,v}}:(u,v)\in E_{\mathbf{t}}\right\} +\end{displaymath} +and denote by $E_{\beta}$ the set: +\begin{displaymath} + R_{\beta} \eqdef \left\{v\in R:q_v\geq \frac{\beta}{1-\beta}\right\} +\end{displaymath} +Then it follows from the analysis of Section XXX, that for a fixed value of +$\beta$, the optimal value of \eqref{eq:program} when optimizing over +$\mathcal{T}$ is given by: +\begin{displaymath} + f(\beta) = (|R|-|R_\beta|)\log\frac{\beta}{1-\beta} + + n\log(1-\beta)+ \sum_{v\in R_\beta} \log q_v +\end{displaymath} +Hence our goal is to find the maximum of $f(\beta)$ over the interval $(0,1)$. +Note that the maximum exists since $f$ diverges to $-\infty$ as $\beta$ +approaches $0$ or $1$. Furthermore $f$ is continuous over $(0,1)$ since we can +rewrite it: +\begin{displaymath} + f(\beta) = n\log(1-\beta) + \sum_{v\in R} + \log\max\left\{\frac{\beta}{1-\beta}, q_v\right\} +\end{displaymath} + \subsection{Finding the Pairwise Propagation Parameters} \end{document} |
