Diffstat (limited to 'supplements')
| -rw-r--r-- | supplements/main.tex | 77 |
1 files changed, 69 insertions, 8 deletions
diff --git a/supplements/main.tex b/supplements/main.tex
index 04a306d..2c73442 100644
--- a/supplements/main.tex
+++ b/supplements/main.tex
@@ -1,6 +1,6 @@
 \documentclass{article}
 \usepackage[utf8x]{inputenc}
-\usepackage{amsmath}
+\usepackage{amsmath, amsthm}
 \usepackage{algorithm}% http://ctan.org/pkg/algorithms
 \usepackage{algpseudocode}% http://ctan.org/pkg/algorithmicx
 \usepackage{graphicx}
@@ -8,8 +8,11 @@
 \usepackage{verbatim}

 \DeclareMathOperator*{\argmax}{argmax}
+\newcommand{\eqdef}{\mathbin{\stackrel{\rm def}{=}}}
 \providecommand{\e}[1]{\ensuremath{\times 10^{#1}}}
 \newcommand{\given}{\,|\,}
+\newtheorem{theorem}{Theorem}
+\newtheorem{proposition}[theorem]{Proposition}

 \title{Hawkes contagion model}
 \author{Ben Green \and Thibaut Horel \and Andrew Papachristos}
@@ -64,7 +67,7 @@
 log-likelihood of events $\mathcal{E} = \{(t_i, k_i)\}_{1\leq i\leq N}$ given
 $M$ and $\Phi$ and observation period $[0, T]$:
 \begin{equation}
   \label{eq:likelihood}
-  \mathcal{L}(\mathcal{E}\given M, \Phi) = \sum_{i=1}^N \log\lambda_{k_i}(t_i)
+  \mathcal{L}(M, \Phi\given\mathcal{E}) = \sum_{i=1}^N \log\lambda_{k_i}(t_i)
   - \sum_{k=1}^D\int_{0}^T \lambda_k(t) dt
 \end{equation}
@@ -140,7 +143,7 @@
 death in the second summand of Equation~\eqref{eq:likelihood}.
 Denoting by $T_u$ the time of death of vertex $u$ ($T_u = T$ if the
 individual didn't die during the study period), we obtain:
 \begin{displaymath}
-  \mathcal{L}(\mathcal{E}\given \mu, \alpha, \beta) = \sum_{i=1}^N \log\lambda_{u_i}(t_i)
+  \mathcal{L}(\mu, \alpha, \beta\given\mathcal{E}) = \sum_{i=1}^N \log\lambda_{u_i}(t_i)
   - \sum_{v\in V}\int_{0}^{T_v} \lambda_v(t) dt
 \end{displaymath}
@@ -149,7 +152,7 @@
 be expanded to:
 \begin{equation}
   \label{eq:final-likelihood}
   \begin{split}
-    \mathcal{L}(\mathcal{E}\given \mu, \alpha, \beta) =&
+    \mathcal{L}(\mu, \alpha, \beta\given\mathcal{E}) =&
     \sum_{i=1}^N \log\left(\mu(t_i)
     + \sum_{j:t_j< t_i} g_\alpha(u_i, u_j)\beta e^{-\beta (t_i - t_j)}\right) \\
@@ -223,16 +226,74 @@
 where $\mu_0 = \frac{A}{|V|}$.

 \subsection{Kernel Function Parameters}
-We learn parameters using
+Using the exogenous intensity from Section~\ref{sec:background}, the
+log-likelihood now depends on three parameters $(\mu_0, \alpha, \beta)$, and
+finding the maximum likelihood estimate of these parameters amounts to solving
+the following optimization problem:
+\begin{equation}
+  \label{eq:mle}
+  \argmax_{\mu_0,\alpha,\beta}\mathcal{L}(\mu_0,\alpha,\beta\given\mathcal{E})
+\end{equation}
+Unfortunately, the function $\mathcal{L}$ is not jointly concave in its three
+arguments. We will however exploit the following fact.
+
+\begin{proposition}
+  \label{prop:concave}
+  The function $(\mu_0,\alpha)\mapsto
+  \mathcal{L}(\mu_0,\alpha,\beta\given\mathcal{E})$ is concave.
+\end{proposition}
+
+\begin{proof}
+  Using Equation~\ref{eq:final-likelihood}, it is easy to see that the second
+  and third sums are linear in $(\mu_0, \alpha)$, hence it is sufficient to
+  show that for $1\leq i\leq N$:
+  \begin{displaymath}
+    f(\mu_0,\alpha) = \log\left(
+    \mu(t_i)
+    + \sum_{j:t_j< t_i} g_\alpha(u_i, u_j)\beta e^{-\beta (t_i - t_j)}
+    \right)
+  \end{displaymath}
+  is concave. For this, we see that the operand of the $\log$ function is
+  linear in $(\mu_0, \alpha)$.
+  By composition with the concave function $\log$, we obtain that $f$ is
+  concave and thus conclude the proof.
+\end{proof}

-$\mu_0 = 1.1845e-05$, $\alpha = 0.00317$, and $\beta = 0.0039$.
+Numerically, we observed that $\mathcal{L}$ has many local optima, hence we
+solved \eqref{eq:mle} using the following heuristic:
+\begin{itemize}
+  \item first, we did a brute-force grid search to locate good starting
+    points for the refining heuristic.
+  \item second, starting from the best gridpoints obtained during the first
+    step, we refined the solution by alternating maximization, that is,
+    alternately:
+    \begin{itemize}
+      \item optimizing over $(\mu_0, \alpha)$ for a fixed value of
+        $\beta$. For this, Proposition~\ref{prop:concave} allows us to
+        use standard convex optimization machinery (gradient descent in
+        our case) to solve this step exactly.
+      \item optimizing over $\beta$ for a fixed value of $(\mu_0,
+        \alpha)$. Simulated annealing was used at this step.
+    \end{itemize}
+\end{itemize}
+Other heuristics were considered: using gradient descent for the
+optimization over $\beta$ as well, or using gradient descent to optimize
+over $(\mu_0,\alpha,\beta)$ jointly. All heuristics led to the same optimal
+solution, indicating that our initial grid search was fine enough to identify
+good starting points.
+We obtained the following values of the parameters at
+the optimum:
+\begin{displaymath}
+  \mu_0 = 1.19\cdot 10^{-5},\quad
+  \alpha = 7.82\cdot 10^{-3},\quad
+  \beta = 3.74\cdot 10^{-3}
+\end{displaymath}
+\begin{comment}
 \begin{equation}
   \lambda_v(t) = 1.1845\e{-5} \left[1 + 0.43
   \sin\left(\frac{2\pi}{365.24} t + 4.36\right) \right] + \sum_{u \in V}
   \frac{0.00317}{\text{dist}(u,v)} 0.0039 e^{-0.0039(t-t_u)}
 \end{equation}
+\end{comment}

-\section{Inferring infections}
-[how we determine background vs peer infection]
+\section{Inferring the Patterns of Infections}

 We can estimate if a person was primarily infected via peer contagion by
 comparing the contributions from the background rate and from his or her
 peers.
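The two-stage heuristic added by this patch (brute-force grid search, then alternating refinement of (mu0, alpha) and beta) can be sketched as follows. This is an illustrative toy, not the authors' code: the event data, the constant background rate (the seasonal factor of mu(t) is dropped), and the multiplicative line search standing in for the gradient-descent and simulated-annealing steps are all assumptions made for the sketch.

```python
import math
from itertools import product

# Toy stand-ins for the study's data (all names and values are illustrative):
# event times in days, the vertex of each event, a pairwise distance table,
# and per-vertex censoring times T_v.
times = [1.0, 30.0, 45.0, 200.0]
verts = ["a", "b", "a", "b"]
dist = {"a": {"a": 1.0, "b": 2.0}, "b": {"a": 2.0, "b": 1.0}}
T = {"a": 365.0, "b": 365.0}

def log_likelihood(mu0, alpha, beta):
    # Event term: log lambda_{u_i}(t_i) with kernel
    # g_alpha(u, v) * beta * exp(-beta * dt), g_alpha(u, v) = alpha / dist(u, v).
    ll = 0.0
    for i in range(len(times)):
        lam = mu0 + sum(
            alpha / dist[verts[i]][verts[j]] * beta
            * math.exp(-beta * (times[i] - times[j]))
            for j in range(i))
        ll += math.log(lam)
    # Compensator: integral of lambda_v over [0, T_v], which has a closed
    # form for the exponential kernel.
    for v in T:
        ll -= mu0 * T[v]
        ll -= sum(
            alpha / dist[v][u_j] * (1.0 - math.exp(-beta * (T[v] - t_j)))
            for t_j, u_j in zip(times, verts) if t_j < T[v])
    return ll

# Stage 1: brute-force grid search for good starting points.
grid = [1e-5, 1e-4, 1e-3, 1e-2]
start = max(product(grid, repeat=3), key=lambda p: log_likelihood(*p))

# Stage 2: alternating refinement -- improve (mu0, alpha) with beta fixed,
# then beta with (mu0, alpha) fixed. A crude multiplicative line search
# stands in for the gradient and simulated-annealing steps of the paper.
mu0, alpha, beta = start
for _ in range(50):
    for f in (0.9, 1.1):
        if log_likelihood(mu0 * f, alpha, beta) > log_likelihood(mu0, alpha, beta):
            mu0 *= f
        if log_likelihood(mu0, alpha * f, beta) > log_likelihood(mu0, alpha, beta):
            alpha *= f
        if log_likelihood(mu0, alpha, beta * f) > log_likelihood(mu0, alpha, beta):
            beta *= f
```

Because the refinement only accepts moves that increase the log-likelihood, the final estimate is at least as good as the best grid point, mirroring the role of the grid search as an initializer rather than the final optimizer.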

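The comparison described in the patch's closing paragraph (background rate versus peer contributions) amounts to reading off the additive components of lambda_{u_i}(t_i) and normalizing them into probabilities. A minimal sketch under the same illustrative assumptions (constant background rate, toy data; only the three fitted parameter values come from the patch):

```python
import math

# Illustrative toy data; names and values are assumptions, not the study's.
times = [1.0, 30.0, 45.0]
verts = ["a", "b", "a"]
dist = {"a": {"a": 1.0, "b": 2.0}, "b": {"a": 2.0, "b": 1.0}}

def attribution(i, mu0, alpha, beta):
    # Additive contributions to lambda_{u_i}(t_i): the background rate mu0
    # versus one term per earlier event j, normalized into probabilities.
    contrib = {"background": mu0}
    for j in range(i):
        dt = times[i] - times[j]
        contrib[j] = (alpha / dist[verts[i]][verts[j]]
                      * beta * math.exp(-beta * dt))
    total = sum(contrib.values())
    return {k: c / total for k, c in contrib.items()}

# Fitted values reported in the patch: mu0 = 1.19e-5, alpha = 7.82e-3,
# beta = 3.74e-3.
probs = attribution(2, mu0=1.19e-5, alpha=7.82e-3, beta=3.74e-3)
peer_share = 1.0 - probs["background"]
```

An event would then be labeled a peer infection when `peer_share` dominates the background probability, which is the comparison the section describes.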