 experiments/Makefile         |  2
 experiments/README.txt       | 40
 experiments/build_network.py | 52
 experiments/ml.pyx           | 84
 experiments/plot3d.py        | 23
 experiments/process.py       | 48
 6 files changed, 188 insertions(+), 61 deletions(-)
diff --git a/experiments/Makefile b/experiments/Makefile
index 991c178..0fd956a 100644
--- a/experiments/Makefile
+++ b/experiments/Makefile
@@ -1,6 +1,6 @@
 all:
 	cython -a ml.pyx
-	gcc -pthread -fno-strict-aliasing -march=x86-64 -mtune=generic -O3 -pipe -fstack-protector-strong --param=ssp-buffer-size=4 -DNDEBUG -march=x86-64 -mtune=generic -O3 -pipe -fstack-protector-strong --param=ssp-buffer-size=4 -fPIC -I/usr/include/python2.7 -c ml.c -o ml.o
+	gcc -pthread -fno-strict-aliasing -march=x86-64 -mtune=generic -O3 -pipe -fstack-protector-strong --param=ssp-buffer-size=4 -DNDEBUG -march=x86-64 -mtune=generic -O3 -pipe -fstack-protector-strong --param=ssp-buffer-size=4 -fPIC -I/usr/include/python2.7 -c ml.c -o ml.o
 	gcc -pthread -shared -Wl,-O3,--sort-common,--as-needed,-z,relro ml.o -L/usr/lib -lpython2.7 -o ml.so
diff --git a/experiments/README.txt b/experiments/README.txt
new file mode 100644
index 0000000..13e0d3d
--- /dev/null
+++ b/experiments/README.txt
@@ -0,0 +1,40 @@
+Description of the files
+========================
+
+* build_network.py: takes a .csv file containing a criminal network and builds
+  and extracts a compressed binary representation tailored to make the other
+  scripts faster.
+
+  Takes one argument, the name of the .csv file. The output file has the same
+  name but the extension .pickle
+
+* ml.pyx: code to compute the likelihood of a given alpha (time component) and
+  delta (structural component) for a given infection graph. The optimization of
+  the beta parameter (probability of being a root) is done internally.
+
+* process.py: simple loop to do an exhaustive search over a range of alpha
+  (time component) and delta (structural component) values. The likelihood (and
+  a few other things) for each pair of values is printed to the file "out.log".
+
+  Takes one argument, the name of the .pickle file computed by build_network.py
+
+* plot3d.py: code to obtain a 3d plot of the log likelihood as a function of
+  alpha and delta. Can also be easily modified to obtain 2d plots along
+  specific axes.
+
+Installation
+============
+
+* most of the code uses standard python + numpy + matplotlib
+
+* the file ml.pyx is a Cython file which must be compiled in order to be used.
+At a high level, ml.pyx is compiled into ml.so. The line "import ml" in
+"process.py" will fail if the file ml.so is not present in the directory.
+Important: do not forget to recompile ml.pyx after changing it, otherwise you
+will be working with an outdated version of ml.so!
+
+How to compile? A Makefile is provided. If using make on MacOS is too
+complicated, it is also possible to compile .pyx files using python distutils
+(which I would assume is more standard on MacOS). This only requires distutils
+and a simple setup.py file. More details here:
+http://docs.cython.org/src/reference/compilation.html#configuring-the-c-build
diff --git a/experiments/build_network.py b/experiments/build_network.py
new file mode 100644
index 0000000..aae90db
--- /dev/null
+++ b/experiments/build_network.py
@@ -0,0 +1,52 @@
+from csv import DictReader
+import sys
+from cPickle import dump
+from os.path import splitext
+
+
+def build_network(filename):
+    victims = {}
+    non_victims = {}
+    age = 0.
+    with open(filename) as fh:
+        reader = DictReader(fh)
+        for row in reader:
+            from_, to = int(float(row["from"])), int(float(row["to"]))
+            if int(float(row["dist"])) > 2:
+                continue
+            if row["t2"] != "NA":
+                dt = int(row["t2"]) - int(row["t1"])
+                parent = (int(row["dist"]), dt)
+                if to not in victims:
+                    age += int(row["t2"])
+                    victims[to] = []
+                victims[to].append(parent)
+                if from_ not in victims:
+                    age += int(row["t1"])
+                    victims[from_] = []
+            else:
+                from_, to = int(float(row["from"])), int(float(row["to"]))
+                parent = (int(row["dist"]), 3012 - int(row["t1"]))
+                if to not in victims:
+                    age += 3012
+                    non_victims[to] = []
+                non_victims[to].append(parent)
+                if from_ not in victims:
+                    age += int(row["t1"])
+                    victims[from_] = []
+    root_victims = {}
+    for victim in victims.keys():
+        if not victims[victim]:
+            del victims[victim]
+            root_victims[victim] = []
+    return root_victims, victims, non_victims, age
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        sys.exit("usage: {0} <file>".format(sys.argv[0]))
+
+    filename = sys.argv[1]
+    root, _ = splitext(filename)
+    root_victims, victims, non_victims, age = build_network(filename)
+    dump((root_victims, victims, non_victims, age), open(root + ".pickle", "w"))
diff --git a/experiments/ml.pyx b/experiments/ml.pyx
index 48d4549..74e5be3 100644
--- a/experiments/ml.pyx
+++ b/experiments/ml.pyx
@@ -6,63 +6,101 @@ from libc.math cimport log, exp
 
 DTYPE = np.float64
 ctypedef np.float_t DTYPE_t
+
 
 
 cdef DTYPE_t weight_success(int dist, int dt, DTYPE_t alpha, DTYPE_t delta,
                             DTYPE_t gamma):
+    """weight for successful infection, exponential time model"""
     cdef DTYPE_t structural, temporal, result
-    structural = delta ** dist
+    structural = delta ** (dist)
     temporal = exp(-alpha * dt) * (1 - exp(-alpha))
-    result = structural * temporal
+    result = log(structural * temporal)
+    return result
+
+
+cdef DTYPE_t weight_success_power(int dist, int dt, DTYPE_t alpha,
+                                  DTYPE_t delta, DTYPE_t gamma):
+    """weight for successful infection, power-law time model"""
+    cdef DTYPE_t structural, temporal, result
+    structural = delta ** (dist)
+    temporal = 1. / (1. + (dt - 1.)/alpha)**0.01 - 1. / (1. + dt/alpha)**0.01
+    result = log(structural * temporal)
     return result
 
 
 cdef DTYPE_t weight_failure(int dist, int dt, DTYPE_t alpha, DTYPE_t delta,
                             DTYPE_t gamma):
+    """weight for failed infection, exponential time model"""
     cdef DTYPE_t structural, temporal, result
-    structural = delta ** dist
+    structural = delta ** (dist)
     temporal = 1. - exp(-alpha * dt)
-    result = 1. - structural * temporal
+    #result = log(1. - structural)
+    result = log(1. - structural * temporal)
     return result
 
 
-def ml(dict root_victims, dict victims, dict non_victims,
+cdef DTYPE_t weight_failure_power(int dist, int dt, DTYPE_t alpha,
+                                  DTYPE_t delta, DTYPE_t gamma):
+    """weight for failed infection, power-law time model"""
+    cdef DTYPE_t structural, temporal, result
+    structural = delta ** (dist)
+    temporal = 1. - 1. / (1. + dt/alpha)**0.01
+    result = log(1. - structural * temporal)
+    return result
+
+def ml(dict root_victims, dict victims, dict non_victims, DTYPE_t age,
        DTYPE_t alpha, DTYPE_t delta, DTYPE_t gamma=10):
     cdef:
-        int n_roots, n_victims, n_nodes, roots, i, dist, dt, t
-        DTYPE_t beta, all_failures
+        int n_roots, n_victims, n_nodes, roots, i, dist, dt, t, l
+        DTYPE_t beta, all_failures, ll, beta2
         list parents, failures, successes
     n_roots, n_victims = len(root_victims), len(victims)
-    n_nodes = n_victims + len(non_victims)
+    n_nodes = n_victims + len(non_victims) + n_roots
     cdef:
         np.ndarray[DTYPE_t] probs = np.zeros(n_victims, dtype=DTYPE)
         np.ndarray[DTYPE_t] probs_fail = np.zeros(n_victims, dtype=DTYPE)
         np.ndarray[DTYPE_t] probs_nv = np.zeros(len(non_victims), dtype=DTYPE)
     for i, parents in enumerate(victims.itervalues()):
-        failures = [log(weight_failure(dist, dt, alpha, delta, gamma))
-                    for (dist, dt) in parents]
+        # for each victim node i, compute the probability that all its parents
+        # fail to infect it, also computes the probability that its most
+        # likely parent infects it
+        failures = [weight_failure(dist, dt, alpha, delta, gamma)
+                    for (dist, dt) in parents]
         all_failures = sum(failures)
-        successes = [log(weight_success(dist, dt, alpha, delta, gamma))
-                     for (dist, dt) in parents]
-        probs[i] = max(s - failures[i] for i, s in enumerate(successes))
+        successes = [weight_success(dist, dt, alpha, delta, gamma)
+                     for (dist, dt) in parents]
+        probs[i] = max(s - failures[l] for l, s in enumerate(successes))
         probs_fail[i] = all_failures
     for i, parents in enumerate(non_victims.itervalues()):
-        failures = [log(weight_failure(dist, dt, alpha, delta, gamma))
-                    for (dist, dt) in parents]
+        # for each non victim node, compute the probability that all its
+        # parents fail to infect it
+        failures = [weight_failure(dist, dt, alpha, delta, gamma)
+                    for (dist, dt) in parents]
         probs_nv[i] = sum(failures)
     probs.sort()
     probs = probs[::-1]
     cdef:
         np.ndarray[DTYPE_t] cums = probs.cumsum()
+    ll = probs_fail.sum()
+    ll += probs_nv.sum()
+
     for i in xrange(n_victims - 1, 0, -1):
-        roots = n_victims - 1 - i
+        # iterate over all victim nodes to find the optimal threshold
+        roots = n_roots + n_victims - 1 - i
         beta = 1. / (1. + exp(-probs[i]))
-        if beta > float(roots) / float(n_nodes):
+        if beta > float(roots) / age:
             break
     else:
         print "alpha: {0}, delta: {1}. Everyone is a root".format(alpha, delta)
-        roots = n_victims
-        beta = float(roots) / float(n_nodes)
-    return (beta, roots,
-            roots * log(beta) + (n_nodes - roots) * log(1 - beta) + cums[i]
-            + probs_nv.sum()
-            + probs_fail.sum())
+        roots = n_victims + n_roots
+        i = -1
+    beta = float(roots) / age
+    for i in xrange(n_victims - 1, 0, -1):
+        if probs[i] >= log(beta/(1.- beta)):
+            break
+    ll += age * log(1 - beta)
+    if i >= 0:
+        ll += cums[i]
+    if roots > 0:
+        ll += roots * log(beta) - roots * log(1 - beta)
+    return (beta, roots, ll)
diff --git a/experiments/plot3d.py b/experiments/plot3d.py
new file mode 100644
index 0000000..64c144d
--- /dev/null
+++ b/experiments/plot3d.py
@@ -0,0 +1,23 @@
+from mpl_toolkits.mplot3d import Axes3D
+from matplotlib import cm
+import matplotlib.pyplot as plt
+import numpy as np
+
+with open("out.log") as fh:
+    values = [map(float, line.strip().split()) for line in fh]
+    #values = [(b, a, l) for (b, a, l) in values if b >= 0.04]
+    am = max(values, key=lambda x: x[4])
+    am[0] = 1./am[0]
+    print am
+    alpha, delta, beta, _, l = zip(*values)
+    alpha = 1./np.array(alpha)
+
+    fig = plt.figure(figsize=(12, 8))
+    ax = fig.gca(projection='3d')
+
+    ax.plot_trisurf(alpha, delta, l, cmap=cm.jet, linewidth=0.001)
+    plt.xlabel("alpha")
+    plt.ylabel("delta")
+    ax.set_zlabel('Likelihood')
+    #plt.savefig("ll.pdf")
+    plt.show()
diff --git a/experiments/process.py b/experiments/process.py
index b5b70ca..5fc55ba 100644
--- a/experiments/process.py
+++ b/experiments/process.py
@@ -1,51 +1,25 @@
-from csv import DictReader
 import sys
 from ml import ml
 import numpy as np
-from cPickle import dump, load
+from cPickle import load
 from itertools import product
+from math import exp
 
 
-def build_network(filename):
-    victims = {}
-    non_victims = {}
-    with open(filename) as fh:
-        reader = DictReader(fh)
-        for row in reader:
-            from_, to = int(float(row["from"])), int(float(row["to"]))
-            if row["t2"] != "NA":
-                dt = int(row["t2"]) - int(row["t1"])
-                parent = (int(row["dist"]), dt)
-                if to not in victims:
-                    victims[to] = []
-                victims[to].append(parent)
-                if from_ not in victims:
-                    victims[from_] = []
-            else:
-                from_, to = int(float(row["from"])), int(float(row["to"]))
-                parent = (int(row["dist"]), 3012 - int(row["t1"]))
-                if to not in victims:
-                    non_victims[to] = []
-                non_victims[to].append(parent)
-                if from_ not in victims:
-                    victims[from_] = []
-    root_victims = {}
-    for victim in victims.keys():
-        if not victims[victim]:
-            del victims[victim]
-            root_victims[victim] = []
-    return root_victims, victims, non_victims
+def print_ll(alpha, delta):
+    beta, roots, ll = ml(root_victims, victims, non_victims, age, alpha, delta)
+    print "\t".join(map(str, [alpha, delta, beta, roots, ll, exp(ll)])) + "\n"
 
 
 if __name__ == "__main__":
-    #root_victims, victims, non_victims = build_network(sys.argv[1])
-    #dump((root_victims, victims, non_victims), open("network.pickle", "w"))
-    root_victims, victims, non_victims = load(open("network.pickle"))
+    if len(sys.argv) < 2:
+        sys.exit("usage: {0} <file>".format(sys.argv[0]))
 
-    alpha = np.arange(0.0000005, 0.00000051, 0.000001)
-    delta = np.arange(1., 1.000001, 0.001)
+    root_victims, victims, non_victims, age = load(open(sys.argv[1]))
+    alpha = 1. / np.arange(1., 1000., 10.)  # parameter of the time component
+    delta = np.arange(0.01, 0.9, 0.03)  # parameter of the structural component
     with open("out.log", "a") as fh:
         for a, d in product(alpha, delta):
-            beta, roots, ll = ml(root_victims, victims, non_victims, a, d)
+            beta, roots, ll = ml(root_victims, victims, non_victims, age, a, d)
             fh.write("\t".join(map(str, [a, d, beta, roots, ll])) + "\n")
             fh.flush()
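
Note on the distutils route mentioned in README.txt: a minimal setup.py sketch
(hypothetical, not part of this commit) for building ml.pyx with Cython's
distutils integration could look roughly like this; running
"python setup.py build_ext --inplace" then drops ml.so next to process.py.

    # setup.py -- illustrative only; assumes Cython is installed
    from distutils.core import setup
    from Cython.Build import cythonize

    setup(
        name="ml",
        ext_modules=cythonize("ml.pyx"),
    )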

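The edge weights used by ml.pyx can also be sanity-checked without compiling
the extension. The sketch below re-implements the exponential-time-model
weights in plain Python, following the formulas in the diff (the docstrings
are copied from ml.pyx; the example parameter values are arbitrary).

    # weights_check.py -- illustrative cross-check of ml.pyx's edge weights
    from math import exp, log

    def weight_success(dist, dt, alpha, delta):
        """weight for successful infection, exponential time model"""
        structural = delta ** dist
        temporal = exp(-alpha * dt) * (1 - exp(-alpha))
        return log(structural * temporal)

    def weight_failure(dist, dt, alpha, delta):
        """weight for failed infection, exponential time model"""
        structural = delta ** dist
        temporal = 1. - exp(-alpha * dt)
        return log(1. - structural * temporal)

    if __name__ == "__main__":
        # example: parent at distance 1, observed delay of 5 time steps
        print weight_success(1, 5, 0.1, 0.3), weight_failure(1, 5, 0.1, 0.3)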