 experiments/Makefile         |  2
 experiments/README.txt       | 40
 experiments/build_network.py | 52
 experiments/ml.pyx           | 84
 experiments/plot3d.py        | 23
 experiments/process.py       | 48
 6 files changed, 188 insertions(+), 61 deletions(-)
diff --git a/experiments/Makefile b/experiments/Makefile
index 991c178..0fd956a 100644
--- a/experiments/Makefile
+++ b/experiments/Makefile
@@ -1,6 +1,6 @@
 all:
 	cython -a ml.pyx
-	gcc -pthread -fno-strict-aliasing -march=x86-64 -mtune=generic -O3 -pipe -fstack-protector-strong --param=ssp-buffer-size=4 -DNDEBUG -march=x86-64 -mtune=generic -O3 -pipe -fstack-protector-strong --param=ssp-buffer-size=4 -fPIC -I/usr/include/python2.7 -c ml.c -o ml.o
+	gcc -pthread -fno-strict-aliasing -march=x86-64 -mtune=generic -O3 -pipe -fstack-protector-strong --param=ssp-buffer-size=4 -DNDEBUG -march=x86-64 -mtune=generic -O3 -pipe -fstack-protector-strong --param=ssp-buffer-size=4 -fPIC -I/usr/include/python2.7 -c ml.c -o ml.o
 	gcc -pthread -shared -Wl,-O3,--sort-common,--as-needed,-z,relro ml.o -L/usr/lib -lpython2.7 -o ml.so
diff --git a/experiments/README.txt b/experiments/README.txt
new file mode 100644
index 0000000..13e0d3d
--- /dev/null
+++ b/experiments/README.txt
@@ -0,0 +1,40 @@
+Description of the files
+========================
+
+* build_network.py: takes a .csv file containing a criminal network and builds
+  and extracts a compressed binary representation tailored to make the other
+  scripts faster.
+
+  Takes one argument, the name of the .csv file. The output file has the same
+  name but the extension .pickle
+
+* ml.pyx: code to compute the likelihood of a given alpha (time component) and
+  delta (structural component) for a given infection graph. The optimization of
+  the beta parameter (probability of being a root) is done internally.
+
+* process.py: simple loop to do an exhaustive search over a range of alpha
+  (time component) and delta (structural component) values. The likelihood (and
+  a few other things) for each pair of values is printed to the file "out.log".
+
+  Takes one argument, the name of the .pickle file computed by build_network.py
+
+* plot3d.py: code to obtain a 3d plot of the log likelihood as a function of
+  alpha and delta. Can also be easily modified to obtain 2d plots along
+  specific axes.
+
+Installation
+============
+
+* most of the code uses standard python + numpy + matplotlib
+
+* the file ml.pyx is a Cython file which must be compiled in order to be used.
+At a high level, ml.pyx is compiled into ml.so. The line "import ml" in
+"process.py" will fail if the file ml.so is not present in the directory.
+Important: do not forget to recompile ml.pyx after changing it, otherwise you
+will be working with an outdated version of ml.so!
+
+How to compile? A Makefile is provided. If using make on MacOS is too
+complicated, it is also possible to compile .pyx files using python distutils
+(which I would assume is more standard on MacOS). This only requires distutils
+and a simple setup.py file. More details here:
+http://docs.cython.org/src/reference/compilation.html#configuring-the-c-build
diff --git a/experiments/build_network.py b/experiments/build_network.py
new file mode 100644
index 0000000..aae90db
--- /dev/null
+++ b/experiments/build_network.py
@@ -0,0 +1,52 @@
+from csv import DictReader
+import sys
+from cPickle import dump
+from os.path import splitext
+
+
+def build_network(filename):
+    victims = {}
+    non_victims = {}
+    age = 0.
+    with open(filename) as fh:
+        reader = DictReader(fh)
+        for row in reader:
+            from_, to = int(float(row["from"])), int(float(row["to"]))
+            if int(float(row["dist"])) > 2:
+                continue
+            if row["t2"] != "NA":
+                dt = int(row["t2"]) - int(row["t1"])
+                parent = (int(row["dist"]), dt)
+                if to not in victims:
+                    age += int(row["t2"])
+                    victims[to] = []
+                victims[to].append(parent)
+                if from_ not in victims:
+                    age += int(row["t1"])
+                    victims[from_] = []
+            else:
+                from_, to = int(float(row["from"])), int(float(row["to"]))
+                parent = (int(row["dist"]), 3012 - int(row["t1"]))
+                if to not in victims:
+                    age += 3012
+                    non_victims[to] = []
+                non_victims[to].append(parent)
+                if from_ not in victims:
+                    age += int(row["t1"])
+                    victims[from_] = []
+    root_victims = {}
+    for victim in victims.keys():
+        if not victims[victim]:
+            del victims[victim]
+            root_victims[victim] = []
+    return root_victims, victims, non_victims, age
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        sys.exit("usage: {0} <file>".format(sys.argv[0]))
+
+    filename = sys.argv[1]
+    root, _ = splitext(filename)
+    root_victims, victims, non_victims, age = build_network(filename)
+    dump((root_victims, victims, non_victims, age), open(root + ".pickle", "w"))
diff --git a/experiments/ml.pyx b/experiments/ml.pyx
index 48d4549..74e5be3 100644
--- a/experiments/ml.pyx
+++ b/experiments/ml.pyx
@@ -6,63 +6,101 @@ from libc.math cimport log, exp
 
 DTYPE = np.float64
 ctypedef np.float_t DTYPE_t
+
 
 
 cdef DTYPE_t weight_success(int dist, int dt, DTYPE_t alpha, DTYPE_t delta,
                             DTYPE_t gamma):
+    """weight for successful infection, exponential time model"""
     cdef DTYPE_t structural, temporal, result
-    structural = delta ** dist
+    structural = delta ** (dist)
     temporal = exp(-alpha * dt) * (1 - exp(-alpha))
-    result = structural * temporal
+    result = log(structural * temporal)
+    return result
+
+
+cdef DTYPE_t weight_success_power(int dist, int dt, DTYPE_t alpha,
+                                  DTYPE_t delta, DTYPE_t gamma):
+    """weight for successful infection, power-law time model"""
+    cdef DTYPE_t structural, temporal, result
+    structural = delta ** (dist)
+    temporal = 1. / (1. + (dt - 1.)/alpha)**0.01 - 1. / (1. + dt/alpha)**0.01
+    result = log(structural * temporal)
     return result
 
 
 cdef DTYPE_t weight_failure(int dist, int dt, DTYPE_t alpha, DTYPE_t delta,
                             DTYPE_t gamma):
+    """weight for failed infection, exponential time model"""
     cdef DTYPE_t structural, temporal, result
-    structural = delta ** dist
+    structural = delta ** (dist)
     temporal = 1. - exp(-alpha * dt)
-    result = 1. - structural * temporal
+    #result = log(1. - structural)
+    result = log(1. - structural * temporal)
     return result
 
 
-def ml(dict root_victims, dict victims, dict non_victims,
+cdef DTYPE_t weight_failure_power(int dist, int dt, DTYPE_t alpha,
+                                  DTYPE_t delta, DTYPE_t gamma):
+    """weight for failed infection, power-law time model"""
+    cdef DTYPE_t structural, temporal, result
+    structural = delta ** (dist)
+    temporal = 1. - 1. / (1. + dt/alpha)**0.01
+    result = log(1. - structural * temporal)
+    return result
+
+def ml(dict root_victims, dict victims, dict non_victims, DTYPE_t age,
        DTYPE_t alpha, DTYPE_t delta, DTYPE_t gamma=10):
     cdef:
-        int n_roots, n_victims, n_nodes, roots, i, dist, dt, t
-        DTYPE_t beta, all_failures
+        int n_roots, n_victims, n_nodes, roots, i, dist, dt, t, l
+        DTYPE_t beta, all_failures, ll, beta2
         list parents, failures, successes
     n_roots, n_victims = len(root_victims), len(victims)
-    n_nodes = n_victims + len(non_victims)
+    n_nodes = n_victims + len(non_victims) + n_roots
     cdef:
         np.ndarray[DTYPE_t] probs = np.zeros(n_victims, dtype=DTYPE)
         np.ndarray[DTYPE_t] probs_fail = np.zeros(n_victims, dtype=DTYPE)
         np.ndarray[DTYPE_t] probs_nv = np.zeros(len(non_victims), dtype=DTYPE)
     for i, parents in enumerate(victims.itervalues()):
-        failures = [log(weight_failure(dist, dt, alpha, delta, gamma))
-                    for (dist, dt) in parents]
+        # for each victim node i, compute the probability that all its parents
+        # fail to infect it, also computes the probability that its most
+        # likely parent infects it
+        failures = [weight_failure(dist, dt, alpha, delta, gamma)
+                    for (dist, dt) in parents]
         all_failures = sum(failures)
-        successes = [log(weight_success(dist, dt, alpha, delta, gamma))
-                     for (dist, dt) in parents]
-        probs[i] = max(s - failures[i] for i, s in enumerate(successes))
+        successes = [weight_success(dist, dt, alpha, delta, gamma)
+                     for (dist, dt) in parents]
+        probs[i] = max(s - failures[l] for l, s in enumerate(successes))
         probs_fail[i] = all_failures
     for i, parents in enumerate(non_victims.itervalues()):
-        failures = [log(weight_failure(dist, dt, alpha, delta, gamma))
-                    for (dist, dt) in parents]
+        # for each non victim node, compute the probability that all its
+        # parents fail to infect it
+        failures = [weight_failure(dist, dt, alpha, delta, gamma)
+                    for (dist, dt) in parents]
         probs_nv[i] = sum(failures)
     probs.sort()
     probs = probs[::-1]
     cdef:
         np.ndarray[DTYPE_t] cums = probs.cumsum()
+    ll = probs_fail.sum()
+    ll += probs_nv.sum()
+
     for i in xrange(n_victims - 1, 0, -1):
-        roots = n_victims - 1 - i
+        # iterate over all victim nodes to find the optimal threshold
+        roots = n_roots + n_victims - 1 - i
         beta = 1. / (1. + exp(-probs[i]))
-        if beta > float(roots) / float(n_nodes):
+        if beta > float(roots) / age:
             break
     else:
         print "alpha: {0}, delta: {1}. Everyone is a root".format(alpha, delta)
-        roots = n_victims
-        beta = float(roots) / float(n_nodes)
-    return (beta, roots,
-            roots * log(beta) + (n_nodes - roots) * log(1 - beta) + cums[i]
-            + probs_nv.sum()
-            + probs_fail.sum())
+        roots = n_victims + n_roots
+        i = -1
+    beta = float(roots) / age
+    for i in xrange(n_victims - 1, 0, -1):
+        if probs[i] >= log(beta/(1.- beta)):
+            break
+    ll += age * log(1 - beta)
+    if i >= 0:
+        ll += cums[i]
+    if roots > 0:
+        ll += roots * log(beta) - roots * log(1 - beta)
+    return (beta, roots, ll)
diff --git a/experiments/plot3d.py b/experiments/plot3d.py
new file mode 100644
index 0000000..64c144d
--- /dev/null
+++ b/experiments/plot3d.py
@@ -0,0 +1,23 @@
+from mpl_toolkits.mplot3d import Axes3D
+from matplotlib import cm
+import matplotlib.pyplot as plt
+import numpy as np
+
+with open("out.log") as fh:
+    values = [map(float, line.strip().split()) for line in fh]
+    #values = [(b, a, l) for (b, a, l) in values if b >= 0.04]
+    am = max(values, key=lambda x: x[4])
+    am[0] = 1./am[0]
+    print am
+    alpha, delta, beta, _, l = zip(*values)
+    alpha = 1./np.array(alpha)
+
+    fig = plt.figure(figsize=(12, 8))
+    ax = fig.gca(projection='3d')
+
+    ax.plot_trisurf(alpha, delta, l, cmap=cm.jet, linewidth=0.001)
+    plt.xlabel("alpha")
+    plt.ylabel("delta")
+    ax.set_zlabel('Likelihood')
+    #plt.savefig("ll.pdf")
+    plt.show()
diff --git a/experiments/process.py b/experiments/process.py
index b5b70ca..5fc55ba 100644
--- a/experiments/process.py
+++ b/experiments/process.py
@@ -1,51 +1,25 @@
-from csv import DictReader
 import sys
 from ml import ml
 import numpy as np
-from cPickle import dump, load
+from cPickle import load
 from itertools import product
+from math import exp
 
 
-def build_network(filename):
-    victims = {}
-    non_victims = {}
-    with open(filename) as fh:
-        reader = DictReader(fh)
-        for row in reader:
-            from_, to = int(float(row["from"])), int(float(row["to"]))
-            if row["t2"] != "NA":
-                dt = int(row["t2"]) - int(row["t1"])
-                parent = (int(row["dist"]), dt)
-                if to not in victims:
-                    victims[to] = []
-                victims[to].append(parent)
-                if from_ not in victims:
-                    victims[from_] = []
-            else:
-                from_, to = int(float(row["from"])), int(float(row["to"]))
-                parent = (int(row["dist"]), 3012 - int(row["t1"]))
-                if to not in victims:
-                    non_victims[to] = []
-                non_victims[to].append(parent)
-                if from_ not in victims:
-                    victims[from_] = []
-    root_victims = {}
-    for victim in victims.keys():
-        if not victims[victim]:
-            del victims[victim]
-            root_victims[victim] = []
-    return root_victims, victims, non_victims
+def print_ll(alpha, delta):
+    beta, roots, ll = ml(root_victims, victims, non_victims, age, alpha, delta)
+    print "\t".join(map(str, [alpha, delta, beta, roots, ll, exp(ll)])) + "\n"
 
 
 if __name__ == "__main__":
-    #root_victims, victims, non_victims = build_network(sys.argv[1])
-    #dump((root_victims, victims, non_victims), open("network.pickle", "w"))
-    root_victims, victims, non_victims = load(open("network.pickle"))
+    if len(sys.argv) < 2:
+        sys.exit("usage: {0} <file>".format(sys.argv[0]))
 
-    alpha = np.arange(0.0000005, 0.00000051, 0.000001)
-    delta = np.arange(1., 1.000001, 0.001)
+    root_victims, victims, non_victims, age = load(open(sys.argv[1]))
+    alpha = 1. / np.arange(1., 1000., 10.)  # parameter of the time component
+    delta = np.arange(0.01, 0.9, 0.03)  # parameter of the structural component
     with open("out.log", "a") as fh:
         for a, d in product(alpha, delta):
-            beta, roots, ll = ml(root_victims, victims, non_victims, a, d)
+            beta, roots, ll = ml(root_victims, victims, non_victims, age, a, d)
             fh.write("\t".join(map(str, [a, d, beta, roots, ll])) + "\n")
             fh.flush()
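
Note on the distutils route mentioned in README.txt: a minimal setup.py sketch
(hypothetical, not part of this commit) for building ml.pyx with Cython's
distutils integration could look roughly like this; running
"python setup.py build_ext --inplace" then drops ml.so next to process.py.

    # setup.py -- illustrative only; assumes Cython is installed
    from distutils.core import setup
    from Cython.Build import cythonize

    setup(
        name="ml",
        ext_modules=cythonize("ml.pyx"),
    )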

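The edge weights used by ml.pyx can also be sanity-checked without compiling
the extension. The sketch below re-implements the exponential-time-model
weights in plain Python, following the formulas in the diff (the docstrings
are copied from ml.pyx; the example parameter values are arbitrary).

    # weights_check.py -- illustrative cross-check of ml.pyx's edge weights
    from math import exp, log

    def weight_success(dist, dt, alpha, delta):
        """weight for successful infection, exponential time model"""
        structural = delta ** dist
        temporal = exp(-alpha * dt) * (1 - exp(-alpha))
        return log(structural * temporal)

    def weight_failure(dist, dt, alpha, delta):
        """weight for failed infection, exponential time model"""
        structural = delta ** dist
        temporal = 1. - exp(-alpha * dt)
        return log(1. - structural * temporal)

    if __name__ == "__main__":
        # example: parent at distance 1, observed delay of 5 time steps
        print weight_success(1, 5, 0.1, 0.3), weight_failure(1, 5, 0.1, 0.3)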