-rw-r--r--   experiments/new.py  |  23
-rw-r--r--   hawkes/cause.py     |  24
-rw-r--r--   hawkes/data.py      |  65
-rw-r--r--   hawkes/main.py      |  43
-rw-r--r--   hawkes/plot.py      |  26
-rw-r--r--   hawkes/refine.py    |  43
-rw-r--r--   hawkes/sanity.py    |   5
-rw-r--r--   theory/warmup.tex   | 119
8 files changed, 348 insertions, 0 deletions
diff --git a/experiments/new.py b/experiments/new.py
new file mode 100644
index 0000000..4274dd7

from math import log, exp

T = 100  # observation horizon
N = 100  # number of criminals


def kernel(t, mu):
    # Exponential time kernel phi(t) = mu * exp(-mu * t).
    return mu * exp(-mu * t)


def base_rate(t, lamb):
    # Constant base rate lambda(t) = lamb.
    return lamb


def ll(crimes, weights, mu, lamb):
    # Log-likelihood of the time-sorted event list crimes = [(t_i, v_i), ...]
    # under the Hawkes model of theory/warmup.tex.
    r = 0
    for i, crime in enumerate(crimes):
        t, v = crime
        # Influence of all earlier events on this one (the list is
        # time-sorted, so the first i entries are the past).
        a = sum(weights[(u, v)] * kernel(t - s, mu) for s, u in crimes[:i])
        r += log(base_rate(t, lamb) + a)
    # Compensator: integral of each criminal's rate over [0, T], with a
    # constant base rate contributing lamb * T per criminal.
    for j in range(N):
        a = sum(weights[(u, j)] * (1 - exp(-mu * (T - s))) for s, u in crimes)
        r -= lamb * T + a
    return r

diff --git a/hawkes/cause.py b/hawkes/cause.py
new file mode 100644
index 0000000..0cce9d2

from cPickle import load
from math import exp


def main(a):
    # For a parameter triple, count the events with no candidate parent
    # at all (dr) and the events whose incoming edge intensity is
    # dominated by the base rate (r).
    lamb, alpha, mu = a
    dr = 0
    r = 0
    for ((n1, t1), s) in event_edges.iteritems():
        if not s:
            dr += 1
        if lamb > sum(alpha * w * mu * exp(-mu * (t1 - t2))
                      for (n2, t2, w) in s):
            r += 1
    return lamb, alpha, mu, dr, r


if __name__ == "__main__":
    # event_edges is read as a global by main() above.
    nodes, edges, events, event_edges = load(open("data.pickle", "rb"))
    for i, line in enumerate(open("values-sorted.txt")):
        if i > 100:
            break
        lamb, alpha, mu, v = map(float, line.strip().split())
        print main((lamb, alpha, mu))

diff --git a/hawkes/data.py b/hawkes/data.py
new file mode 100644
index 0000000..4d7744e

from csv import DictReader
import sys
from itertools import product
from cPickle import dump

MAX_TIME = 3012  # censoring horizon for criminals with no fatal day


def parse(s):
    return None if s == "NA" else int(float(s))


def load_nodes(filename):
    # Map each criminal to its fatal day, censored at MAX_TIME.
    with open(filename) as fh:
        reader = DictReader(fh)
        d = {parse(row["name"]): parse(row["fatal_day"]) for row in reader}
    for n, t in d.iteritems():
        if t is None:
            d[n] = MAX_TIME
    return d


def load_edges(filename):
    # edges[fro][to] is the edge weight; events[fro] is the set of times
    # at which fro is infected.
    events = {}
    edges = {}
    with open(filename) as fh:
        reader = DictReader(fh)
        for row in reader:
            fro, to, t, weight = map(parse, [row["from"], row["to"],
                                             row["t1"], row["w1"]])
            edges.setdefault(fro, {})[to] = weight
            events.setdefault(fro, set()).add(t)
    return edges, events


def compute_event_edges(events, edges):
    # event_edges[(to, t2)] is the set of (fro, t1, weight) triples of
    # earlier events that could have caused the event (to, t2).
    event_edges = {}

    for fro in events:
        for t in events[fro]:
            event_edges[(fro, t)] = set()

    for fro in edges:
        for to in edges[fro]:
            try:
                e1, e2 = events[fro], events[to]
            except KeyError:
                continue
            for t1, t2 in product(e1, e2):
                if t1 < t2:
                    event_edges[(to, t2)].add((fro, t1, edges[fro][to]))
    return event_edges


if __name__ == "__main__":
    nodes = load_nodes(sys.argv[1])
    edges, events = load_edges(sys.argv[2])
    event_edges = compute_event_edges(events, edges)
    dump((nodes, edges, events, event_edges), open("data.pickle", "wb"))

diff --git a/hawkes/main.py b/hawkes/main.py
new file mode 100644
index 0000000..65586ad

from cPickle import load
from math import log, exp
#import numpy as np
#from scipy.optimize import basinhopping
from itertools import product


def iter_events(events):
    # Flatten {node: set(times)} into (node, time) pairs.
    for n, s in events.iteritems():
        for t in s:
            yield (n, t)


def ll(a):
    # Negative log-likelihood of the Hawkes model (see theory/warmup.tex).
    lamb, alpha, mu = a

    # Log-intensity at each observed event.
    r1 = sum(log(lamb + sum(alpha * w * mu * exp(-mu * (t1 - t2))
                            for (n2, t2, w) in s))
             for ((n1, t1), s) in event_edges.iteritems())
    # Integral of the edge-triggered intensity, censored at nodes[n2].
    r2 = sum(sum(alpha * w * (1 - exp(-mu * (nodes[n2] - t1)))
                 for n2, w in edges.get(n1, {}).iteritems()
                 if nodes[n2] > t1)
             for (n1, t1) in iter_events(events))
    # Integral of the base rate, with per-node observation horizons.
    r3 = lamb * sum(nodes.itervalues())
    return -(r1 - r2 - r3)


if __name__ == "__main__":
    nodes, edges, events, event_edges = load(open("data.pickle", "rb"))
    d = {}
    for line in open("values.txt"):
        v = map(float, line.strip().split())
        d[tuple(v[:3])] = v[3]

    # lamb = [20. ** i for i in range(-15, 15)]
    # alpha = [20. ** i for i in range(-15, 15)]
    # mu = [20. ** i for i in range(-15, 15)]
    # for l, a, m in product(lamb, alpha, mu):
    #     if (l, a, m) in d:
    #         continue
    #     print l, a, m, ll((l, a, m))
    print ll((2., 1e10, 1e-21))
    #r = basinhopping(ll, init, disp=True, stepsize=0.1, T=10000., niter=500)
    #print r.x
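One way the commented-out basinhopping call above might be wired up (a sketch, not part of this commit): optimize over log-parameters so that lambda, alpha and mu stay positive. The ll_log wrapper and the starting point are illustrative only, and the snippet assumes it runs inside main.py's __main__ block, where ll and its globals are defined.

import numpy as np
from scipy.optimize import basinhopping

def ll_log(x):
    # x holds (log lambda, log alpha, log mu); ll is main.py's objective.
    return ll(tuple(np.exp(x)))

init = np.log([2., 1e10, 1e-21])  # the point probed at the end of main.py
r = basinhopping(ll_log, init, disp=True, stepsize=0.5, niter=500)
print np.exp(r.x)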
diff --git a/hawkes/plot.py b/hawkes/plot.py
new file mode 100644
index 0000000..24d3f35

from mpl_toolkits.mplot3d import Axes3D  # noqa: registers the 3d projection
import matplotlib.pyplot as plt

fig = plt.figure()
ax = fig.gca(projection='3d')

# For every (lambda, mu) pair, keep the best (smallest) negative
# log-likelihood over all alpha values.
d = {}
for line in open("values.txt"):
    v = map(float, line.strip().split())
    try:
        if (v[0], v[2]) in d:
            d[(v[0], v[2])] = min(d[(v[0], v[2])], v[3])
        else:
            d[(v[0], v[2])] = v[3]
    except IndexError:  # skip incomplete lines
        continue

# Rescale so the axes stay readable.
x, y, z = [], [], []
for k, v in d.iteritems():
    x.append(k[0] / 1000000.)
    y.append(k[1] / 1000000.)
    z.append(v / 100000.)

surf = ax.plot_trisurf(x, y, z)
plt.show()

diff --git a/hawkes/refine.py b/hawkes/refine.py
new file mode 100644
index 0000000..81fd547

from cPickle import load
from itertools import product
from math import exp, log


def iter_events(events):
    # Flatten {node: set(times)} into (node, time) pairs.
    for n, s in events.iteritems():
        for t in s:
            yield (n, t)


def inprod(a, b):
    # Element-wise product of two parameter triples.
    return tuple(float(x * y) for x, y in zip(a, b))


def ll(a):
    # Negative log-likelihood; same expression as in hawkes/main.py.
    lamb, alpha, mu = a

    r1 = sum(log(lamb + sum(alpha * w * mu * exp(-mu * (t1 - t2))
                            for (n2, t2, w) in s))
             for ((n1, t1), s) in event_edges.iteritems())
    r2 = sum(sum(alpha * w * (1 - exp(-mu * (nodes[n2] - t1)))
                 for n2, w in edges.get(n1, {}).iteritems()
                 if nodes[n2] > t1)
             for (n1, t1) in iter_events(events))
    r3 = lamb * sum(nodes.itervalues())
    return -(r1 - r2 - r3)


if __name__ == "__main__":
    nodes, edges, events, event_edges = load(open("data.pickle", "rb"))
    d = {}
    for line in open("values.txt"):
        v = map(float, line.strip().split())
        d[tuple(v[:3])] = v[3]
    # Evaluate a multiplicative grid around each of the ten best points
    # found so far.
    ranked = d.items()
    ranked.sort(key=lambda x: x[1])
    for a, _ in ranked[:10]:
        t = [1. / i for i in range(2, 6)] + [float(i) for i in range(1, 6)]
        for b in product(t, repeat=3):
            l, al, m = inprod(a, b)
            if (l, al, m) in d:
                continue
            print l, al, m, ll((l, al, m))
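main.py, refine.py and plot.py all read values.txt, whose four-column format (lambda, alpha, mu, negative log-likelihood) is implied by the v[:3]/v[3] parsing above. A sketch, not part of this commit, that writes a small synthetic file in that format so the plotting and refinement scripts can be smoke-tested without the real data; the toy objective is arbitrary:

from itertools import product

with open("values.txt", "w") as fh:
    for l, a, m in product([0.05, 1., 20.], repeat=3):
        toy = (l - 1.) ** 2 + (a - 1.) ** 2 + (m - 1.) ** 2  # arbitrary bowl
        fh.write("%g %g %g %g\n" % (l, a, m, toy))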
diff --git a/hawkes/sanity.py b/hawkes/sanity.py
new file mode 100644
index 0000000..c837757

from cPickle import load

if __name__ == "__main__":
    # Quick check that data.pickle loads and has the expected shape.
    nodes, edges, events, event_edges = load(open("data.pickle", "rb"))
    print len(nodes)

diff --git a/theory/warmup.tex b/theory/warmup.tex
new file mode 100644
index 0000000..c34eb89

\documentclass[10pt]{article}
\usepackage[utf8x]{inputenc}
\usepackage{amsmath,amsfonts,amsthm}
\usepackage[english]{babel}
\usepackage[capitalize, noabbrev]{cleveref}
\usepackage{algorithm}
\usepackage{algpseudocode}

\newenvironment{enumerate*}%
  {\vspace{-2ex} \begin{enumerate} %
   \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
  {\end{enumerate}}

\newenvironment{itemize*}%
  {\vspace{-2ex} \begin{itemize} %
   \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
  {\end{itemize}}

\newenvironment{description*}%
  {\vspace{-2ex} \begin{description} %
   \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
  {\end{description}}

\DeclareMathOperator{\E}{\mathbb{E}}
\let\P\relax
\DeclareMathOperator{\P}{\mathbb{P}}
\newcommand{\ex}[1]{\E\left[#1\right]}
\newcommand{\prob}[1]{\P\left[#1\right]}
\newcommand{\inprod}[1]{\left\langle #1 \right\rangle}
\newcommand{\R}{\mathbf{R}}
\newcommand{\N}{\mathbf{N}}
\newcommand{\C}{\mathcal{C}}
\newcommand{\eps}{\varepsilon}
\newcommand{\eqdef}{\mathbin{\stackrel{\rm def}{=}}}
\newcommand{\llbracket}{[\![}

\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\algrenewcommand\algorithmicensure{\textbf{Output:}}
\algrenewcommand\algorithmicrequire{\textbf{Input:}}
\DeclareMathOperator*{\argmax}{arg\,max}

\author{}
\title{}
\date{}

\begin{document}

\paragraph{Notation}
\begin{itemize}
  \item $n$ criminals, $V = \{1,\dots,n\}$.
  \item $\alpha_{i,j}$: the influence of criminal $i$ on criminal $j$.
  \item The infections form a set of pairs $C = \{(t_i, v_i),\; 1\leq i\leq m\}$,
    where $t_i$ is the time and $v_i\in\{1,\dots, n\}$ is the identifier of
    the criminal.
\end{itemize}

\paragraph{Model}
The infections are modeled as a multivariate Hawkes process. The instantaneous
rate of infection of criminal $j$ at time $t$ is given by:
\begin{displaymath}
  \lambda_j(t) = \lambda(t) + \sum_{t_i<t}\alpha_{v_i, j}\,\phi(t-t_i)
\end{displaymath}

\emph{Technical concern:} stability of the process. There is a condition on
the $\alpha_{i,j}$ parameters ensuring that the process is stable, i.e.\ that
it does not keep accelerating forever. We probably do not need to worry about
it too much: empirically, the process is stable.

\paragraph{Time kernel}
We choose an exponential kernel for $\phi$ (it will allow the recursive trick
later on): $\phi(t) = \mu e^{-\mu t}$.

\paragraph{Base rate} $\lambda(t)$ captures the seasonal effect. TBD; the
worst case is to choose a constant approximation, $\lambda(t) = \lambda_0$. We
define $\Lambda(T) \eqdef \int_{0}^T \lambda(t)dt$.

\paragraph{Edge model} TBD.

\paragraph{Likelihood} The log-likelihood for observation horizon $T$ is given
by:
\begin{displaymath}
  \mathcal{L}(C) = \sum_{i=1}^m \log \lambda_{v_i}(t_i)
  - \sum_{j=1}^n\int_{0}^T \lambda_j(t)dt
\end{displaymath}
Note that we have:
\begin{displaymath}
  \int_{0}^T \lambda_j(t)dt = \int_{0}^T\lambda(t)dt + \sum_{i=1}^m
  \alpha_{v_i, j}\big[1-e^{-\mu(T-t_i)}\big]
\end{displaymath}

Hence the log-likelihood can be rewritten (after reordering a little bit):
\begin{displaymath}
  \mathcal{L}(C) = \sum_{i=1}^m\log \lambda_{v_i}(t_i)
  - \sum_{i=1}^m A_{v_i} \big[1-e^{-\mu(T-t_i)}\big]
  - n\Lambda(T)
\end{displaymath}
where $A_{v_i} = \sum_{j=1}^n \alpha_{v_i, j}$ (out-weight of criminal $v_i$),
or in complete form:
\begin{displaymath}
  \mathcal{L}(C) = \sum_{i=1}^m\log\left[ \lambda(t_i)
  + \sum_{j=1}^{i-1}\alpha_{v_j, v_i}\mu e^{-\mu(t_i-t_j)}\right]
  - \sum_{i=1}^m A_{v_i} \big[1-e^{-\mu(T-t_i)}\big]
  - n\Lambda(T)
\end{displaymath}

\paragraph{Computational trick} As written above, the log-likelihood can be
computed in time $O(m^2 + n^2)$. This is not linear in the input size, which
is $O(m + n^2)$. The $O(m^2)$ term comes from the first sum in the definition
of the log-likelihood. However, in the case of an exponential time kernel, a
recursive trick reduces the cost of this term to $O(mn)$, for an overall
running time of $O(mn + n^2)$.
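For concreteness, here is a sketch of the recursion (the $B_u$ notation is
introduced only for this sketch): process the events in increasing order of
time and maintain, for each criminal $u$,
\begin{displaymath}
  B_u(i) \eqdef \sum_{j<i,\; v_j=u} e^{-\mu(t_i - t_j)},
  \qquad
  B_u(i+1) = e^{-\mu(t_{i+1} - t_i)}\big(B_u(i) + \mathbf{1}\{v_i = u\}\big).
\end{displaymath}
The first sum of the log-likelihood is then
$\sum_{i=1}^m \log\big[\lambda(t_i) + \mu\sum_{u=1}^n \alpha_{u, v_i}
B_u(i)\big]$; each event requires $O(n)$ work to update and query the $B_u$,
hence $O(mn)$ in total.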
\emph{Question:} Is it worth it in our case? Yes, if the number of victims is
significantly smaller than the number of infections. Is that the case?


\end{document}
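A minimal Python sketch (not part of this commit) of the $O(mn)$ evaluation of the first log-likelihood term, implementing the recursion from theory/warmup.tex. It assumes the data layout of experiments/new.py: crimes is a time-sorted list of (t, v) pairs and weights maps (u, v) to alpha_{u,v}; the function name first_term is ours.

from math import log, exp

def first_term(crimes, weights, mu, lamb, n):
    b = [0.] * n  # b[u] = sum of exp(-mu * (t - t_j)) over past events of u
    r = 0.
    prev_t = None
    for t, v in crimes:
        if prev_t is not None:
            decay = exp(-mu * (t - prev_t))  # age every accumulator at once
            b = [x * decay for x in b]
        r += log(lamb + mu * sum(weights.get((u, v), 0.) * b[u]
                                 for u in range(n)))
        b[v] += 1.  # the current event now influences later ones
        prev_t = t
    return r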
