summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--experiments/new.py23
-rw-r--r--hawkes/cause.py24
-rw-r--r--hawkes/data.py65
-rw-r--r--hawkes/main.py43
-rw-r--r--hawkes/plot.py26
-rw-r--r--hawkes/refine.py43
-rw-r--r--hawkes/sanity.py5
-rw-r--r--theory/warmup.tex119
8 files changed, 348 insertions, 0 deletions
diff --git a/experiments/new.py b/experiments/new.py
new file mode 100644
index 0000000..4274dd7
--- /dev/null
+++ b/experiments/new.py
@@ -0,0 +1,23 @@
+from math import log, exp
+
+T = 100
+N = 100
+
+
def kernel(t, mu):
    """Exponential time kernel: phi(t) = mu * exp(-mu * t)."""
    decay = exp(-mu * t)
    return mu * decay
+
+
def base_rate(t, lamb):
    """Constant base rate lambda(t) = lamb; `t` is deliberately unused
    (placeholder for a future seasonal model, per theory/warmup.tex)."""
    return lamb
+
+
def ll(crimes, weights, mu, lamb):
    """Log-likelihood of a multivariate Hawkes process with constant
    base rate `lamb` and exponential kernel of decay `mu`.

    crimes  -- list of (time, node) pairs, assumed sorted by time
    weights -- {(source_node, target_node): alpha weight}; assumed to
               contain every ordered pair that can occur in the data
               (KeyError otherwise -- TODO confirm with the caller)
    Uses the module constants T (observation horizon) and N (node count).
    """
    r = 0
    # Event term: log of the rate at each event = base rate plus the
    # excitation from all *earlier* events.
    for i, crime in enumerate(crimes):
        t, v = crime
        # BUG FIX: was crimes[:t] -- sliced by the time *value* instead
        # of the event index.
        a = sum(weights[(u, v)] * kernel(t - s, mu) for s, u in crimes[:i])
        r += log(base_rate(t, lamb) + a)
    # Compensator term: integral of each node's excitation over [0, T].
    # BUG FIX: was `r -= log(a)` of the kernel itself, reused the stale
    # loop variable v instead of j, and never returned the result.
    for j in range(N):
        # integral of w*kernel(., mu) over [s, T] is w*(1 - e^{-mu(T-s)})
        a = sum(weights[(u, j)] * (1 - exp(-mu * (T - s))) for s, u in crimes)
        r -= a
    # Integral of the constant base rate over [0, T] for all N nodes.
    r -= N * lamb * T
    return r
diff --git a/hawkes/cause.py b/hawkes/cause.py
new file mode 100644
index 0000000..0cce9d2
--- /dev/null
+++ b/hawkes/cause.py
@@ -0,0 +1,24 @@
+from cPickle import load
+from math import exp
+
+
def main(a, edge_sets=None):
    """Diagnostics for a parameter triple a = (lamb, alpha, mu).

    For every event, compare the base rate lamb with the total
    excitation received from earlier neighbouring events.

    Returns (lamb, alpha, mu, dr, r) where
      dr -- number of events with no exciting predecessor at all
      r  -- number of events whose base rate exceeds their excitation

    edge_sets defaults to the module-global `event_edges` loaded in the
    __main__ block; passing it explicitly makes the function testable.
    """
    lamb, alpha, mu = a
    if edge_sets is None:
        edge_sets = event_edges  # global, set after unpickling data.pickle
    dr = 0
    r = 0
    # items() instead of iteritems(): same pairs, works on py2 and py3.
    for ((n1, t1), s) in edge_sets.items():
        if not s:
            dr += 1
        # Total excitation at (n1, t1) from its recorded predecessors.
        if lamb > sum(alpha * w * mu * exp(-mu * (t1 - t2))
                      for (n2, t2, w) in s):
            r += 1
    return lamb, alpha, mu, dr, r
+
+
if __name__ == "__main__":
    # Load the preprocessed data written by data.py, then evaluate the
    # diagnostics for the top rows of values-sorted.txt (best first).
    nodes, edges, events, event_edges = load(open("data.pickle", "rb"))
    for i, line in enumerate(open("values-sorted.txt")):
        if i > 100:
            # only the ~100 best parameter settings
            break
        # Each row: lamb alpha mu neg-log-likelihood (v itself is unused).
        lamb, alpha, mu, v = map(float, line.strip().split())
        print main((lamb, alpha, mu))
diff --git a/hawkes/data.py b/hawkes/data.py
new file mode 100644
index 0000000..4d7744e
--- /dev/null
+++ b/hawkes/data.py
@@ -0,0 +1,65 @@
+from csv import DictReader
+import sys
+from itertools import product
+from cPickle import dump
+
+MAX_TIME = 3012
+
+
def parse(s):
    """Convert one CSV field to an int; the sentinel "NA" maps to None.

    Goes through float first so values like "3.0" parse cleanly
    (truncating toward zero).
    """
    if s == "NA":
        return None
    return int(float(s))
+
+
def load_nodes(filename):
    """Read the node CSV and return {node id: death day}.

    Nodes whose "fatal_day" field is NA are still alive at the end of
    the observation window and get the censoring horizon MAX_TIME.
    """
    with open(filename) as fh:
        d = {}
        for row in DictReader(fh):
            # Single pass: the original built the dict and then rewrote
            # values while iterating iteritems() (py2-only and fragile).
            t = parse(row["fatal_day"])
            d[parse(row["name"])] = MAX_TIME if t is None else t
        return d
+
+
def load_edges(filename):
    """Read the edge CSV.

    Returns (edges, events) where
      edges  -- {from_node: {to_node: weight}}
      events -- {from_node: set of event times}

    NOTE(review): only the "t1"/"w1" columns are read; if the same
    (from, to) pair appears on several rows, later weights overwrite
    earlier ones -- confirm that is intended.
    """
    events = {}
    edges = {}
    with open(filename) as fh:
        for row in DictReader(fh):
            fro, to, t, weight = map(parse, [row["from"], row["to"],
                                             row["t1"], row["w1"]])
            # setdefault replaces the manual get()/reassign dance.
            edges.setdefault(fro, {})[to] = weight
            events.setdefault(fro, set()).add(t)
    return edges, events
+
+
def compute_event_edges(events, edges):
    """For every event, collect its potential causes.

    Returns {(node, time): set of (source_node, source_time, weight)}
    containing, for each event, the strictly earlier events of its
    in-neighbours in the directed `edges` graph.
    """
    # Start every event with an empty cause set so events without any
    # predecessor still appear in the result.
    event_edges = {(fro, t): set() for fro in events for t in events[fro]}

    for fro in edges:
        for to in edges[fro]:
            try:
                e1, e2 = events[fro], events[to]
            except KeyError:
                # One endpoint has no events: nothing to link.
                continue
            w = edges[fro][to]  # hoisted: invariant over the time pairs
            for t1, t2 in product(e1, e2):
                if t1 < t2:
                    # Sets are mutable -- no need to re-store into the
                    # dict as the original did.
                    event_edges[(to, t2)].add((fro, t1, w))
    return event_edges
+
+
if __name__ == "__main__":
    # Usage: python data.py <nodes.csv> <edges.csv>
    # Preprocess the CSVs and pickle everything for the other scripts.
    nodes = load_nodes(sys.argv[1])
    edges, events = load_edges(sys.argv[2])
    event_edges = compute_event_edges(events, edges)
    # Context manager so the pickle file is flushed and closed (the
    # original left the file handle dangling).
    with open("data.pickle", "wb") as fh:
        dump((nodes, edges, events, event_edges), fh)
diff --git a/hawkes/main.py b/hawkes/main.py
new file mode 100644
index 0000000..65586ad
--- /dev/null
+++ b/hawkes/main.py
@@ -0,0 +1,43 @@
+from cPickle import load
+from math import log, exp
+#import numpy as np
+#from scipy.optimize import basinhopping
+from itertools import product
+
+
+def iter_events(events):
+ for n, s in events.iteritems():
+ for t in s:
+ yield (n, t)
+
+
def ll(a):
    """Return the *negative* log-likelihood of the Hawkes model at the
    parameter triple a = (lamb, alpha, mu): base rate, global excitation
    scale and exponential-kernel decay. Negated so it can be handed to a
    minimizer (e.g. scipy basinhopping, see the commented-out code below).

    NOTE(review): reads the module globals nodes, edges, events and
    event_edges set in the __main__ block after unpickling data.pickle;
    calling it before they exist raises NameError.
    """
    lamb, alpha, mu = a

    # r1: sum over events of log(rate at the event) = log(base rate +
    # excitation from the recorded earlier neighbouring events).
    r1 = sum(log(lamb + sum(alpha * w * mu * exp(-mu * (t1 - t2))
                 for (n2, t2, w) in s))
             for ((n1, t1), s) in event_edges.iteritems())
    # r2: compensator of the excitation -- the integral of
    # alpha*w*mu*exp(-mu u) from event time t1 to each out-neighbour's
    # horizon nodes[n2], for neighbours still alive after t1.
    r2 = sum(sum(alpha * w * (1 - exp(-mu * (nodes[n2] - t1)))
                 for n2, w in edges[n1].iteritems() if nodes[n2] > t1)
             for (n1, t1) in iter_events(events))
    # r3: compensator of the constant base rate, integrated over each
    # node's individual observation window [0, nodes[n]].
    r3 = lamb * sum(nodes.itervalues())
    return -(r1 - r2 - r3)
+
+
if __name__ == "__main__":
    # Load the preprocessed network/event data produced by data.py.
    nodes, edges, events, event_edges = load(open("data.pickle", "rb"))
    # d caches previously evaluated (lamb, alpha, mu) -> -log-likelihood
    # so grid points already in values.txt are not recomputed.
    d = {}
    for line in open("values.txt"):
        v = map(float, line.strip().split())
        d[tuple(v[:3])] = v[3]

    # Disabled geometric grid search over the three parameters.
    # lamb = [20. ** i for i in range(-15, 15)]
    # alpha = [20. ** i for i in range(-15, 15)]
    # mu = [20. ** i for i in range(-15, 15)]
    # for l, a, m in product(lamb, alpha, mu):
    #     if (l, a, m) in d:
    #         continue
    #     print l, a, m, ll((l, a, m))
    # Single probe evaluation at an extreme parameter setting.
    print ll((2, 10000000000., 0.000000000000000000001))
    #r = basinhopping(ll, init, disp=True, stepsize=0.1, T=10000., niter=500)
    #print r.x
diff --git a/hawkes/plot.py b/hawkes/plot.py
new file mode 100644
index 0000000..24d3f35
--- /dev/null
+++ b/hawkes/plot.py
@@ -0,0 +1,26 @@
+from mpl_toolkits.mplot3d import Axes3D
+from matplotlib import cm
+import matplotlib.pyplot as plt
+
+fig = plt.figure()
+ax = fig.gca(projection='3d')
+d = {}
+for line in open("values.txt"):
+ v = map(float, line.strip().split())
+ try:
+ if (v[0], v[2]) in d:
+ d[(v[0], v[2])] = min(d[(v[0], v[2])], v[3])
+ else:
+ d[(v[0], v[2])] = v[3]
+ except:
+ continue
+
+x, y, z = [], [], []
+for k, v in d.iteritems():
+ x.append(k[0] / 1000000.)
+ y.append(k[1] / 1000000.)
+ z.append(v / 100000.)
+
+print x
+surf = ax.plot_trisurf(x, y, z)
+plt.show()
diff --git a/hawkes/refine.py b/hawkes/refine.py
new file mode 100644
index 0000000..81fd547
--- /dev/null
+++ b/hawkes/refine.py
@@ -0,0 +1,43 @@
+from cPickle import load
+from itertools import product
+from math import exp, log
+
+
+def iter_events(events):
+ for n, s in events.iteritems():
+ for t in s:
+ yield (n, t)
+
+
def inprod(a, b):
    """Element-wise (Hadamard) product of two equal-length sequences,
    returned as a tuple of floats.

    NOTE: despite the name, this is NOT an inner product -- it does not
    sum the products. Used to rescale parameter triples in __main__.
    """
    out = []
    for x, y in zip(a, b):
        out.append(float(x * y))
    return tuple(out)
+
+
def ll(a):
    """Return the *negative* log-likelihood of the Hawkes model at the
    parameter triple a = (lamb, alpha, mu).

    NOTE(review): verbatim copy of ll() in hawkes/main.py -- consider
    moving it to a shared module so the two cannot drift apart. Reads
    the module globals nodes, edges, events and event_edges set in the
    __main__ block after unpickling data.pickle.
    """
    lamb, alpha, mu = a

    # r1: sum over events of log(rate at the event) = log(base rate +
    # excitation from the recorded earlier neighbouring events).
    r1 = sum(log(lamb + sum(alpha * w * mu * exp(-mu * (t1 - t2))
                 for (n2, t2, w) in s))
             for ((n1, t1), s) in event_edges.iteritems())
    # r2: compensator of the excitation kernels up to each
    # out-neighbour's horizon nodes[n2], for neighbours alive after t1.
    r2 = sum(sum(alpha * w * (1 - exp(-mu * (nodes[n2] - t1)))
                 for n2, w in edges[n1].iteritems() if nodes[n2] > t1)
             for (n1, t1) in iter_events(events))
    # r3: compensator of the constant base rate over each node's
    # observation window [0, nodes[n]].
    r3 = lamb * sum(nodes.itervalues())
    return -(r1 - r2 - r3)
+
+
if __name__ == "__main__":
    # Load the preprocessed data and the grid-search results so far.
    nodes, edges, events, event_edges = load(open("data.pickle", "rb"))
    # d maps (lamb, alpha, mu) -> previously computed -log-likelihood.
    d = {}
    for line in open("values.txt"):
        v = map(float, line.strip().split())
        d[tuple(v[:3])] = v[3]
    l = d.items()
    l.sort(key=lambda x: x[1])
    # Refine around the 10 best triples by rescaling each coordinate
    # with factors 1/5, 1/4, ..., 4, 5, skipping already-seen points.
    for a, _ in l[:10]:
        t = [1. / i for i in range(2, 6)] + [float(i) for i in range(1, 6)]
        for b in product(t, repeat=3):
            # NOTE(review): this rebinds `l` (the sorted list) as the
            # lamb value; harmless because l[:10] was already evaluated
            # above, but renaming would be clearer.
            l, al, m = inprod(a, b)
            if (l, al, m) in d:
                continue
            print l, al, m, ll((l, al, m))
diff --git a/hawkes/sanity.py b/hawkes/sanity.py
new file mode 100644
index 0000000..c837757
--- /dev/null
+++ b/hawkes/sanity.py
@@ -0,0 +1,5 @@
+from cPickle import load
+
if __name__ == "__main__":
    # Quick sanity check: unpickle the dataset written by data.py and
    # report how many nodes it contains.
    nodes, edges, events, event_edges = load(open("data.pickle", "rb"))
    print len(nodes)
diff --git a/theory/warmup.tex b/theory/warmup.tex
new file mode 100644
index 0000000..c34eb89
--- /dev/null
+++ b/theory/warmup.tex
@@ -0,0 +1,119 @@
+\documentclass[10pt]{article}
+\usepackage[utf8x]{inputenc}
+\usepackage{amsmath,amsfonts,amsthm}
+\usepackage[english]{babel}
+\usepackage[capitalize, noabbrev]{cleveref}
+\usepackage{algorithm}
+\usepackage{algpseudocode}
+
+\newenvironment{enumerate*}%
+ {\vspace{-2ex} \begin{enumerate} %
+ \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
+ {\end{enumerate}}
+
+\newenvironment{itemize*}%
+ {\vspace{-2ex} \begin{itemize} %
+ \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
+ {\end{itemize}}
+
+\newenvironment{description*}%
+ {\vspace{-2ex} \begin{description} %
+ \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
+ {\end{description}}
+
+\DeclareMathOperator{\E}{\mathbb{E}}
+\let\P\relax
+\DeclareMathOperator{\P}{\mathbb{P}}
+\newcommand{\ex}[1]{\E\left[#1\right]}
+\newcommand{\prob}[1]{\P\left[#1\right]}
+\newcommand{\inprod}[1]{\left\langle #1 \right\rangle}
+\newcommand{\R}{\mathbf{R}}
+\newcommand{\N}{\mathbf{N}}
+\newcommand{\C}{\mathcal{C}}
+\newcommand{\eps}{\varepsilon}
+\newcommand{\eqdef}{\mathbin{\stackrel{\rm def}{=}}}
+\newcommand{\llbracket}{[\![}
+
+\newtheorem{theorem}{Theorem}
+\newtheorem{lemma}{Lemma}
+\algrenewcommand\algorithmicensure{\textbf{Output:}}
+\algrenewcommand\algorithmicrequire{\textbf{Input:}}
+\DeclareMathOperator*{\argmax}{arg\,max}
+
+\author{}
+\title{}
+\date{}
+
+\begin{document}
+
+\paragraph{Notations}
+\begin{itemize}
+ \item $n$ criminals, $V = \{1,\dots,n\}$
+ \item $\alpha_{i,j}$ influence of criminal $i$ on criminal $j$.
+ \item the infections form a set of pairs $C = \{(t_i, v_i),\; 1\leq i\leq m\}$
+ where $t_i$ is the time and $v_i\in\{1,\dots, n\}$ is the identifier of
+ the criminal.
+\end{itemize}
+
+\paragraph{Model}
+The infections are modeled as a multivariate Hawkes process. The instantaneous rate
+of infection of user $j$ at time $t$ is given by:
+\begin{displaymath}
+ \lambda_j(t) = \lambda(t) + \sum_{t_i<t}\alpha_{v_i, j}
+ \phi(t-t_i)
+\end{displaymath}
+
\emph{Technical concern:} stability of the process. There is a condition on the
$\alpha_{i,j}$ parameters ensuring that the process is stable, i.e.\ does not keep
accelerating forever. We probably do not need to worry about it too much:
empirically, the process is stable.
+
\paragraph{Time kernel}
We choose an exponential kernel for $\phi$ (it will allow the recursive trick
later on): $ \phi(t) = \mu e^{-\mu t} $.
+
+\paragraph{Base rate} $\lambda(t)$ captures the seasonal effect. TBD, worst
+case is to choose a constant approximation, $\lambda(t) = \lambda_0$. We
+define $\Lambda(T) \eqdef \int_{0}^T \lambda(t)dt$.
+
+\paragraph{Edge model} TBD
+
+\paragraph{Likelihood} The log-likelihood for observation horizon $T$ is given
+by:
+\begin{displaymath}
+ \mathcal{L}(C) = \sum_{i=1}^m \log \lambda_{v_i}(t_i)
+ - \sum_{j=1}^n\int_{0}^T \lambda_j(t)dt
+\end{displaymath}
+Note that we have:
+\begin{displaymath}
+ \int_{0}^T \lambda_j(t)dt = \int_{0}^T\lambda(t)dt + \sum_{i=1}^m
+ \alpha_{v_i, j}\big[1-e^{-\mu(T-t_i)}\big]
+\end{displaymath}
+
+Hence the log-likelihood can be rewritten (after reordering a little bit):
+\begin{displaymath}
+ \mathcal{L}(C) = \sum_{i=1}^m\log \lambda_{v_i}(t_i)
+ - \sum_{i=1}^m A_{v_i} \big[1-e^{-\mu(T-t_i)}\big]
+ - n\Lambda(T)
+\end{displaymath}
+where $A_{v_i} = \sum_{j=1}^n \alpha_{v_i, j}$ (out-weight of criminal $v_i$),
+or in complete form:
+\begin{displaymath}
+ \mathcal{L}(C) = \sum_{i=1}^m\log\left[ \lambda(t_i)
+ + \sum_{j=1}^{i-1}\alpha_{v_j, v_i}\mu e^{-\mu(t_i-t_j)}\right]
+ - \sum_{i=1}^m A_{v_i} \big[1-e^{-\mu(T-t_i)}\big]
+ - n\Lambda(T)
+\end{displaymath}
+
+\paragraph{Computational trick} As written above, the log-likelihood can be
+computed in time $O(m^2 + n^2)$. This is not linear in the input size, which is
+$O(m + n^2)$. The $O(m^2)$ term comes from the first sum in the definition of
+the log-likelihood. However, in the case of an exponential time kernel, using
+a recursive trick, the complexity of this term can be reduced to $O(mn)$ for an
+overall running time of $O(mn + n^2)$.
+
\emph{Question:} Is it worth it in our case? Yes, if the number of criminals
$n$ is significantly smaller than the number of infections $m$. Is that the case?
+
+
+\end{document}