From 9de35421f25bf45158187daea4ddfedd1c93f3d8 Mon Sep 17 00:00:00 2001 From: jeanpouget-abadie Date: Sun, 7 Dec 2014 12:08:31 -0500 Subject: renaming directory + creating dataset directory --- src/algorithms.py | 103 ++++++++++++++++++++++++++++ src/cascade_creation.py | 162 +++++++++++++++++++++++++++++++++++++++++++++ src/convex_optimization.py | 135 +++++++++++++++++++++++++++++++++++++ src/make_plots.py | 67 +++++++++++++++++++ src/rip_condition.py | 49 ++++++++++++++ src/timeout.py | 25 +++++++ 6 files changed, 541 insertions(+) create mode 100644 src/algorithms.py create mode 100644 src/cascade_creation.py create mode 100644 src/convex_optimization.py create mode 100644 src/make_plots.py create mode 100644 src/rip_condition.py create mode 100644 src/timeout.py (limited to 'src') diff --git a/src/algorithms.py b/src/algorithms.py new file mode 100644 index 0000000..0e240c9 --- /dev/null +++ b/src/algorithms.py @@ -0,0 +1,103 @@ +import numpy as np +import networkx as nx +import cascade_creation +from collections import Counter +from itertools import izip +import convex_optimization +import timeout + + +def greedy_prediction(G, cascades): + """ + Returns estimated graph from Greedy algorithm in "Learning Epidemic ..." + Only words for independent cascade model! + """ + G_hat = cascade_creation.InfluenceGraph(max_proba=None) + G_hat.add_nodes_from(G.nodes()) + for node in G_hat.nodes(): + print node + # Avoid cases where infection time is None or 0 + tmp = [cascade for cascade in cascades if cascade.infection_time(node) + [0]] + while tmp: + parents = Counter() + for cascade in tmp: + parents += cascade.candidate_infectors(node) + parent = parents.most_common(1)[0][0] + G_hat.add_edge(parent, node) + tmp = [cascade for cascade in tmp if ( + cascade.infection_time(parent)[0] is not None and + cascade.infection_time(parent)[0]+1 not in + cascade.infection_time(node))] + return G_hat + + +def recovery_l1obj_l2constraint(G, cascades, floor_cstt, passed_function, + *args, **kwargs): + """ + Returns estimated graph from convex program specified by passed_function + passed_function should have similar structure to ones in convex_optimation + """ + G_hat = cascade_creation.InfluenceGraph(max_proba=None) + G_hat.add_nodes_from(G.nodes()) + for node in G_hat.nodes(): + print node + try: + M, w = cascade_creation.icc_matrixvector_for_node(cascades, node) + p_node, __ = passed_function(M,w, *args, **kwargs) + G_hat = cascade_creation.add_edges_from_proba_vector(G=G_hat, + p_node=p_node, node=node, floor_cstt=floor_cstt) + except timeout.TimeoutError: + print "TimeoutError, skipping to next node" + return G_hat + + +def correctness_measure(G, G_hat, print_values=False): + """ + Measures correctness of estimated graph G_hat to ground truth G + """ + edges = set(G.edges()) + edges_hat = set(G_hat.edges()) + fp = len(edges_hat - edges) + fn = len(edges - edges_hat) + tp = len(edges | edges_hat) + tn = G.number_of_nodes() ** 2 - fp - fn - tp + + #Other metrics + precision = 1. * tp / (tp + fp) + recall = 1. * tp / (tp + fn) + f1_score = 2.* tp / (2 * tp + fp + fn) + + if print_values: + print "False Positives: {}".format(fp) + print "False Negatives: {}".format(fn) + print "True Positives: {}".format(tp) + print "True Negatives: {}".format(tn) + print "-------------------------------" + print "Precision: {}".format(precision) + print "Recall: {}".format(recall) + print "F1 score: {}".format(f1_score) + + return fp, fn, tp, tn + + +def test(): + """ + unit test + """ + G = cascade_creation.InfluenceGraph(max_proba = .8) + G.erdos_init(n = 100, p = .05) + import time + t0 = time.time() + A = cascade_creation.generate_cascades(G, .2, 10000) + if 1: + G_hat = greedy_prediction(G, A) + if 0: + G_hat = recovery_l1obj_l2constraint(G, A, + passed_function=convex_optimization.l1obj_l2penalization, + floor_cstt=.1, lbda=10) + correctness_measure(G, G_hat, print_values=True) + + +if __name__=="__main__": + test() diff --git a/src/cascade_creation.py b/src/cascade_creation.py new file mode 100644 index 0000000..9a26c03 --- /dev/null +++ b/src/cascade_creation.py @@ -0,0 +1,162 @@ +import networkx as nx +import numpy as np +import collections +from itertools import izip +from sklearn.preprocessing import normalize + +class InfluenceGraph(nx.Graph): + """ + networkX graph with mat and logmat attributes + """ + def __init__(self, max_proba, *args, **kwargs): + self.max_proba = max_proba + super(InfluenceGraph, self).__init__(*args, **kwargs) + + def erdos_init(self, n, p): + G = nx.erdos_renyi_graph(n, p, directed=True) + self.add_edges_from(G.edges()) + + def import_from_file(self, file_name): + """ + Takes a file from the Stanford collection of networks + """ + with open(file_name, 'r') as f: + for edge in f: + if "#" not in edge: + u, v = [int(node) for node in edge.split()] + self.add_edge(u, v) + + @property + def mat(self): + if not hasattr(self, '_mat'): + self._mat = (self.max_proba * np.random.rand(len(self), len(self)) + * np.asarray(nx.adjacency_matrix(self))) + return self._mat + + @property + def logmat(self): + if not hasattr(self, '_logmat'): + self._logmat = np.log(1 - self.mat) + return self._logmat + + +class Cascade(list): + """ + Cascade object: list with attributes + """ + def __init__(self, *args, **kwargs): + super(Cascade, self).__init__(*args, **kwargs) + + def infection_time(self, node): + """ + Returns lists of infections times for node i in cascade + """ + infected_times = [] + for t, infected_set in izip(xrange(len(self)), self): + if infected_set[node]: + infected_times.append(t) + if not infected_times: + infected_times.append(None) + return infected_times + + def candidate_infectors(self, node): + """ + Returns Counter of nodes infected just before node i was + """ + candidate_infectors = collections.Counter() + for t in self.infection_time(node): + if t: # avoid cases where t=0 or t is None + candidate_infectors.update(np.where(self[t-1])[0]) + return candidate_infectors + + +def icc_cascade(G, p_init): + """ + Returns boolean vectors for one cascade + True means node was active at that time step + p_init: proba that node in seed set + """ + susceptible = np.ones(G.number_of_nodes(), dtype=bool) + active = np.random.rand(G.number_of_nodes()) < p_init + susceptible = susceptible & np.logical_not(active) + cascade = Cascade() + while active.any(): + cascade.append(active) + active = np.exp(np.dot(G.logmat, active)) \ + < np.random.rand(G.number_of_nodes()) + active = active & susceptible + susceptible = susceptible & np.logical_not(active) + if not cascade: + print "Empty cascade, consider changing p_init or n_nodes. Retrying." + return icc_cascade(G, p_init) + return cascade + + +def generate_cascades(G, p_init, n_cascades): + """ + returns list of cascades + """"" + return [icc_cascade(G,p_init) for i in xrange(n_cascades)] + + +def icc_matrixvector_for_node(cascades, node): + """ + for the ICC model: + Returns M, w in matrix form where rows of M are i = t + k.T + Excludes all (t,k) after node infection time; w = 1_{infected} + """ + #TODO: you need to remove the variable corresponding to the node + #you are solving for!!!! + if node is None: + return np.vstack(cascades), None + else: + w = [] + M = [] + for cascade in cascades: + t_i = cascade.infection_time(node)[0] + if t_i != 0: + indicator = np.zeros(len(cascade[:t_i])) + if t_i > 0: + indicator[-1] = 1 + w.append(indicator) + M.append(np.array(cascade[:t_i])) + M = np.vstack(M) + w = np.hstack(w) + return M, w + + +def normalize_matrix(M): + """ + returns M with normalized (L_1 norm) columns + """ + return normalize(M.astype("float32"), axis=0, norm="l2") + + +def add_edges_from_proba_vector(G, p_node, node, floor_cstt): + """ + Takes proba vector, node and adds edges to G by flooring very small + probabilities + Also updates G's mat matrix + """ + floor_parent = np.nonzero(p_node*(p_node > floor_cstt)) + for parent in floor_parent[0]: + G.add_edge(parent, node) + #TODO: update G's mat matrix + return G + + +def test(): + """ + unit test + """ + G = InfluenceGraph(max_proba = .3) + G.erdos_init(n = 10, p = 1) + import time + t0 = time.time() + A = generate_cascades(G, .1, 2) + M, w = icc_matrixvector_for_node(A, 0) + t1 = time.time() + print t1 - t0 + +if __name__ == "__main__": + test() diff --git a/src/convex_optimization.py b/src/convex_optimization.py new file mode 100644 index 0000000..530e3f5 --- /dev/null +++ b/src/convex_optimization.py @@ -0,0 +1,135 @@ +import theano +import cascade_creation +from theano import tensor, function +import numpy as np +import timeout +import cvxopt + + +@timeout.timeout(10) +def l1obj_l2constraint(M_val, w_val): + """ + Solves: + min - sum_j theta_j + s.t theta_j <= 0 + |e^{M*theta} - (1 - w)|_2 <= 1 + """ + assert len(M_val) == len(w_val) + + if M_val.dtype == bool: + M_val = M_val.astype('float32') + + m, n = M_val.shape + c = cvxopt.matrix(-1.0, (n,1)) + + theta = tensor.row().T + z = tensor.row().T + theta_ = theta.flatten() + z_ = z.flatten() + M = theano.shared(M_val.astype(theano.config.floatX)) + w = theano.shared(w_val.astype(theano.config.floatX)) + y = (tensor.exp(M.dot(theta_)) - (1 - w)).norm(2) - 1 + y_diff = tensor.grad(y, theta_) + y_hess = z[0] * theano.gradient.hessian(y, theta_) + f_x = theano.function([theta], [y, y_diff], allow_input_downcast=True) + f_xz = theano.function([theta, z], [y, y_diff, y_hess], + allow_input_downcast=True) + + def F(x=None, z=None): + if x is None: + return 1, cvxopt.matrix(.0001, (n,1)) + elif z is None: + y, y_diff = f_x(x) + return cvxopt.matrix(float(y), (1, 1)),\ + cvxopt.matrix(y_diff.astype("float64")).T + else: + y, y_diff, y_hess = f_xz(x, z) + return cvxopt.matrix(float(y), (1, 1)), \ + cvxopt.matrix(y_diff.astype("float64")).T, \ + cvxopt.matrix(y_hess.astype("float64")) + + G = cvxopt.spdiag([1 for i in xrange(n)]) + h = cvxopt.matrix(0.0, (n,1)) + + cvxopt.solvers.options['show_progress'] = False + try: + theta = cvxopt.solvers.cpl(c,F, G, h)['x'] + except ArithmeticError: + print "ArithmeticError thrown, change initial point"+\ + " given to the solver" + + return 1 - np.exp(theta), theta + + +@timeout.timeout(10) +def l1obj_l2penalization(M_val, w_val, lbda): + """ + Solves: + min - sum_j theta_j + lbda*|e^{M*theta} - (1 - w)|_2 + s.t theta_j <= 0 + """ + assert len(M_val) == len(w_val) + + if M_val.dtype == bool: + M_val = M_val.astype('float32') + + if type(lbda) == int: + lbda = np.array(lbda) + + m, n = M_val.shape + + theta = tensor.row().T + z = tensor.row().T + theta_ = theta.flatten() + z_ = z.flatten() + M = theano.shared(M_val.astype(theano.config.floatX)) + w = theano.shared(w_val.astype(theano.config.floatX)) + lbda = theano.shared(lbda.astype(theano.config.floatX)) + y = (theta_).norm(1) + lbda * ( + tensor.exp(M.dot(theta_)) - (1 - w)).norm(2) + y_diff = tensor.grad(y, theta_) + y_hess = z[0] * theano.gradient.hessian(y, theta_) + f_x = theano.function([theta], [y, y_diff], allow_input_downcast=True) + f_xz = theano.function([theta, z], [y, y_diff, y_hess], + allow_input_downcast=True) + + def F(x=None, z=None): + if x is None: + return 0, cvxopt.matrix(.0001, (n,1)) + elif z is None: + y, y_diff = f_x(x) + return cvxopt.matrix(float(y), (1, 1)),\ + cvxopt.matrix(y_diff.astype("float64")).T + else: + y, y_diff, y_hess = f_xz(x, z) + return cvxopt.matrix(float(y), (1, 1)), \ + cvxopt.matrix(y_diff.astype("float64")).T, \ + cvxopt.matrix(y_hess.astype("float64")) + + G = cvxopt.spdiag([1 for i in xrange(n)]) + h = cvxopt.matrix(0.0, (n,1)) + + cvxopt.solvers.options['show_progress'] = False + try: + theta = cvxopt.solvers.cp(F, G, h)['x'] + except ArithmeticError: + print "ArithmeticError thrown, change initial point"+\ + " given to the solver" + + return 1 - np.exp(theta), theta + + +def test(): + """ + unit test + """ + lbda = 10 + G = cascade_creation.InfluenceGraph(max_proba=.8) + G.erdos_init(n=100, p = .1) + A = cascade_creation.generate_cascades(G, .1, 2000) + M_val, w_val = cascade_creation.icc_matrixvector_for_node(A, 0) + p_vec, theta = l1obj_l2penalization(M_val, w_val, lbda) + print p_vec + +if __name__=="__main__": + test() diff --git a/src/make_plots.py b/src/make_plots.py new file mode 100644 index 0000000..9b1fdf4 --- /dev/null +++ b/src/make_plots.py @@ -0,0 +1,67 @@ +import matplotlib.pyplot as plt +import numpy as np +import cascade_creation +import convex_optimization +import algorithms +import rip_condition + + +def plot_rip_numberofnodes(max_proba, n_min, n_max, p_init, n_cascades, K_max): + """ + Plots the RIP constant for varying number of nodes (n_max included) + """ + x = np.arange(n_min, n_max+1) + y = [] + + for n_nodes in x: + print n_nodes + G = cascade_creation.InfluenceGraph(max_proba=.3) + G.erdos_init(n=n_nodes, p=.1) #TODO: handle different inits! + cascades = cascade_creation.generate_cascades(G, p_init=p_init, + n_cascades=n_cascades) + M, __ = cascade_creation.icc_matrixvector_for_node(cascades, None) + M = cascade_creation.normalize_matrix(M) + y.append(rip_condition.find_kth_rip_constants(M, 4)) # + + print y + + plt.clf() + plt.plot(x, y) + #plt.show() + + return x, y + + +def compare_greedy_and_lagrange_cs284r(): + """ + Compares the performance of the greedy algorithm on the + lagrangian sparse recovery objective on the Facebook dataset + for the CS284r project + """ + G = cascade_creation.InfluenceGraph(max_proba = .8) + G.import_from_file("subset_facebook_SNAP.txt") + A = cascade_creation.generate_cascades(G, p_init=.05, n_cascades=100) + + #Greedy + G_hat = algorithms.greedy_prediction(G, A) + algorithms.correctness_measure(G, G_hat, print_values=True) + + #Lagrange Objective + G_hat = algorithms.recovery_l1obj_l2constraint(G, A, + passed_function=convex_optimization.l1obj_l2penalization, + floor_cstt=.1, lbda=10) + algorithms.correctness_measure(G, G_hat, print_values=True) + + +def test(): + """ + unit test + """ + if 0: + plot_rip_numberofnodes(max_proba=.3, n_min=30, n_max=30, + p_init=.01, n_cascades=100, K_max=4) + if 1: + compare_greedy_and_lagrange_cs284r() + +if __name__=="__main__": + test() diff --git a/src/rip_condition.py b/src/rip_condition.py new file mode 100644 index 0000000..fccf9a4 --- /dev/null +++ b/src/rip_condition.py @@ -0,0 +1,49 @@ +import numpy as np +import cascade_creation +import itertools +from scipy import sparse +from scipy.sparse import linalg + +def find_kth_rip_constants(M, k): + """ + Returns max(A_1, A_2) where: + 1 + A_1 = arg max |Mx|_2^2 s.t. |x|_2 = 1 and |x|_0 = k + 1 - A_2 = arg min |Mx|_2^2 s.t. |x|_2 = 1 and |x|_0 = kx + """ + delta = 0 + print M.shape + for col_set in itertools.combinations(xrange(M.shape[1]), k): + M_kcol = M[:,list(col_set)] + delta_upper, delta_lower = upperlower_bound_rip(M_kcol) + delta = max(delta, max(delta_upper, delta_lower)) + return delta + + +def upperlower_bound_rip(M): + """ + returns arg max/min |Mx|_2^2 s.t. |x|_2 = 1 + which is the greatest eigenvalue value of M^T*M + or the square of the greatest singular value of M + """ + M = sparse.csc_matrix(M) + s_upper = linalg.svds(M, 1, tol=.01, which ='LM', maxiter = 2000, + return_singular_vectors=False) + s_lower = linalg.svds(M, 1, tol=.01, which = 'SM', maxiter= 2000, + return_singular_vectors=False) + return s_upper ** 2 - 1, 1 -s_lower ** 2 + + +def test(): + """ + unit test + """ + G = cascade_creation.InfluenceGraph(max_proba=.3) + G.erdos_init(n=10, p =1) + cascades = cascade_creation.generate_cascades(G, p_init=.3, n_cascades=10) + M, __ = cascade_creation.icc_matrixvector_for_node(cascades, None) + M = cascade_creation.normalize_matrix(M) + print find_kth_rip_constants(M, 5) + + +if __name__=="__main__": + test() diff --git a/src/timeout.py b/src/timeout.py new file mode 100644 index 0000000..d7381c3 --- /dev/null +++ b/src/timeout.py @@ -0,0 +1,25 @@ +from functools import wraps +import errno +import os +import signal + +class TimeoutError(Exception): + pass + +def timeout(seconds=10, error_message=os.strerror(errno.ETIME)): + def decorator(func): + def _handle_timeout(signum, frame): + raise TimeoutError(error_message) + + def wrapper(*args, **kwargs): + signal.signal(signal.SIGALRM, _handle_timeout) + signal.alarm(seconds) + try: + result = func(*args, **kwargs) + finally: + signal.alarm(0) + return result + + return wraps(func)(wrapper) + + return decorator -- cgit v1.2.3-70-g09d2