aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/algorithms.py103
-rw-r--r--src/cascade_creation.py162
-rw-r--r--src/convex_optimization.py135
-rw-r--r--src/make_plots.py67
-rw-r--r--src/rip_condition.py49
-rw-r--r--src/timeout.py25
6 files changed, 541 insertions, 0 deletions
diff --git a/src/algorithms.py b/src/algorithms.py
new file mode 100644
index 0000000..0e240c9
--- /dev/null
+++ b/src/algorithms.py
@@ -0,0 +1,103 @@
+import numpy as np
+import networkx as nx
+import cascade_creation
+from collections import Counter
+from itertools import izip
+import convex_optimization
+import timeout
+
+
+def greedy_prediction(G, cascades):
+ """
+ Returns estimated graph from Greedy algorithm in "Learning Epidemic ..."
+ Only words for independent cascade model!
+ """
+ G_hat = cascade_creation.InfluenceGraph(max_proba=None)
+ G_hat.add_nodes_from(G.nodes())
+ for node in G_hat.nodes():
+ print node
+ # Avoid cases where infection time is None or 0
+ tmp = [cascade for cascade in cascades if cascade.infection_time(node)
+ [0]]
+ while tmp:
+ parents = Counter()
+ for cascade in tmp:
+ parents += cascade.candidate_infectors(node)
+ parent = parents.most_common(1)[0][0]
+ G_hat.add_edge(parent, node)
+ tmp = [cascade for cascade in tmp if (
+ cascade.infection_time(parent)[0] is not None and
+ cascade.infection_time(parent)[0]+1 not in
+ cascade.infection_time(node))]
+ return G_hat
+
+
+def recovery_l1obj_l2constraint(G, cascades, floor_cstt, passed_function,
+ *args, **kwargs):
+ """
+ Returns estimated graph from convex program specified by passed_function
+ passed_function should have similar structure to ones in convex_optimation
+ """
+ G_hat = cascade_creation.InfluenceGraph(max_proba=None)
+ G_hat.add_nodes_from(G.nodes())
+ for node in G_hat.nodes():
+ print node
+ try:
+ M, w = cascade_creation.icc_matrixvector_for_node(cascades, node)
+ p_node, __ = passed_function(M,w, *args, **kwargs)
+ G_hat = cascade_creation.add_edges_from_proba_vector(G=G_hat,
+ p_node=p_node, node=node, floor_cstt=floor_cstt)
+ except timeout.TimeoutError:
+ print "TimeoutError, skipping to next node"
+ return G_hat
+
+
+def correctness_measure(G, G_hat, print_values=False):
+ """
+ Measures correctness of estimated graph G_hat to ground truth G
+ """
+ edges = set(G.edges())
+ edges_hat = set(G_hat.edges())
+ fp = len(edges_hat - edges)
+ fn = len(edges - edges_hat)
+ tp = len(edges | edges_hat)
+ tn = G.number_of_nodes() ** 2 - fp - fn - tp
+
+ #Other metrics
+ precision = 1. * tp / (tp + fp)
+ recall = 1. * tp / (tp + fn)
+ f1_score = 2.* tp / (2 * tp + fp + fn)
+
+ if print_values:
+ print "False Positives: {}".format(fp)
+ print "False Negatives: {}".format(fn)
+ print "True Positives: {}".format(tp)
+ print "True Negatives: {}".format(tn)
+ print "-------------------------------"
+ print "Precision: {}".format(precision)
+ print "Recall: {}".format(recall)
+ print "F1 score: {}".format(f1_score)
+
+ return fp, fn, tp, tn
+
+
+def test():
+ """
+ unit test
+ """
+ G = cascade_creation.InfluenceGraph(max_proba = .8)
+ G.erdos_init(n = 100, p = .05)
+ import time
+ t0 = time.time()
+ A = cascade_creation.generate_cascades(G, .2, 10000)
+ if 1:
+ G_hat = greedy_prediction(G, A)
+ if 0:
+ G_hat = recovery_l1obj_l2constraint(G, A,
+ passed_function=convex_optimization.l1obj_l2penalization,
+ floor_cstt=.1, lbda=10)
+ correctness_measure(G, G_hat, print_values=True)
+
+
+if __name__=="__main__":
+ test()
diff --git a/src/cascade_creation.py b/src/cascade_creation.py
new file mode 100644
index 0000000..9a26c03
--- /dev/null
+++ b/src/cascade_creation.py
@@ -0,0 +1,162 @@
+import networkx as nx
+import numpy as np
+import collections
+from itertools import izip
+from sklearn.preprocessing import normalize
+
+class InfluenceGraph(nx.Graph):
+ """
+ networkX graph with mat and logmat attributes
+ """
+ def __init__(self, max_proba, *args, **kwargs):
+ self.max_proba = max_proba
+ super(InfluenceGraph, self).__init__(*args, **kwargs)
+
+ def erdos_init(self, n, p):
+ G = nx.erdos_renyi_graph(n, p, directed=True)
+ self.add_edges_from(G.edges())
+
+ def import_from_file(self, file_name):
+ """
+ Takes a file from the Stanford collection of networks
+ """
+ with open(file_name, 'r') as f:
+ for edge in f:
+ if "#" not in edge:
+ u, v = [int(node) for node in edge.split()]
+ self.add_edge(u, v)
+
+ @property
+ def mat(self):
+ if not hasattr(self, '_mat'):
+ self._mat = (self.max_proba * np.random.rand(len(self), len(self))
+ * np.asarray(nx.adjacency_matrix(self)))
+ return self._mat
+
+ @property
+ def logmat(self):
+ if not hasattr(self, '_logmat'):
+ self._logmat = np.log(1 - self.mat)
+ return self._logmat
+
+
+class Cascade(list):
+ """
+ Cascade object: list with attributes
+ """
+ def __init__(self, *args, **kwargs):
+ super(Cascade, self).__init__(*args, **kwargs)
+
+ def infection_time(self, node):
+ """
+ Returns lists of infections times for node i in cascade
+ """
+ infected_times = []
+ for t, infected_set in izip(xrange(len(self)), self):
+ if infected_set[node]:
+ infected_times.append(t)
+ if not infected_times:
+ infected_times.append(None)
+ return infected_times
+
+ def candidate_infectors(self, node):
+ """
+ Returns Counter of nodes infected just before node i was
+ """
+ candidate_infectors = collections.Counter()
+ for t in self.infection_time(node):
+ if t: # avoid cases where t=0 or t is None
+ candidate_infectors.update(np.where(self[t-1])[0])
+ return candidate_infectors
+
+
+def icc_cascade(G, p_init):
+ """
+ Returns boolean vectors for one cascade
+ True means node was active at that time step
+ p_init: proba that node in seed set
+ """
+ susceptible = np.ones(G.number_of_nodes(), dtype=bool)
+ active = np.random.rand(G.number_of_nodes()) < p_init
+ susceptible = susceptible & np.logical_not(active)
+ cascade = Cascade()
+ while active.any():
+ cascade.append(active)
+ active = np.exp(np.dot(G.logmat, active)) \
+ < np.random.rand(G.number_of_nodes())
+ active = active & susceptible
+ susceptible = susceptible & np.logical_not(active)
+ if not cascade:
+ print "Empty cascade, consider changing p_init or n_nodes. Retrying."
+ return icc_cascade(G, p_init)
+ return cascade
+
+
+def generate_cascades(G, p_init, n_cascades):
+ """
+ returns list of cascades
+ """""
+ return [icc_cascade(G,p_init) for i in xrange(n_cascades)]
+
+
+def icc_matrixvector_for_node(cascades, node):
+ """
+ for the ICC model:
+ Returns M, w in matrix form where rows of M are i = t + k.T
+ Excludes all (t,k) after node infection time; w = 1_{infected}
+ """
+ #TODO: you need to remove the variable corresponding to the node
+ #you are solving for!!!!
+ if node is None:
+ return np.vstack(cascades), None
+ else:
+ w = []
+ M = []
+ for cascade in cascades:
+ t_i = cascade.infection_time(node)[0]
+ if t_i != 0:
+ indicator = np.zeros(len(cascade[:t_i]))
+ if t_i > 0:
+ indicator[-1] = 1
+ w.append(indicator)
+ M.append(np.array(cascade[:t_i]))
+ M = np.vstack(M)
+ w = np.hstack(w)
+ return M, w
+
+
+def normalize_matrix(M):
+ """
+ returns M with normalized (L_1 norm) columns
+ """
+ return normalize(M.astype("float32"), axis=0, norm="l2")
+
+
+def add_edges_from_proba_vector(G, p_node, node, floor_cstt):
+ """
+ Takes proba vector, node and adds edges to G by flooring very small
+ probabilities
+ Also updates G's mat matrix
+ """
+ floor_parent = np.nonzero(p_node*(p_node > floor_cstt))
+ for parent in floor_parent[0]:
+ G.add_edge(parent, node)
+ #TODO: update G's mat matrix
+ return G
+
+
+def test():
+ """
+ unit test
+ """
+ G = InfluenceGraph(max_proba = .3)
+ G.erdos_init(n = 10, p = 1)
+ import time
+ t0 = time.time()
+ A = generate_cascades(G, .1, 2)
+ M, w = icc_matrixvector_for_node(A, 0)
+ t1 = time.time()
+ print t1 - t0
+
+if __name__ == "__main__":
+ test()
diff --git a/src/convex_optimization.py b/src/convex_optimization.py
new file mode 100644
index 0000000..530e3f5
--- /dev/null
+++ b/src/convex_optimization.py
@@ -0,0 +1,135 @@
+import theano
+import cascade_creation
+from theano import tensor, function
+import numpy as np
+import timeout
+import cvxopt
+
+
+@timeout.timeout(10)
+def l1obj_l2constraint(M_val, w_val):
+ """
+ Solves:
+ min - sum_j theta_j
+ s.t theta_j <= 0
+ |e^{M*theta} - (1 - w)|_2 <= 1
+ """
+ assert len(M_val) == len(w_val)
+
+ if M_val.dtype == bool:
+ M_val = M_val.astype('float32')
+
+ m, n = M_val.shape
+ c = cvxopt.matrix(-1.0, (n,1))
+
+ theta = tensor.row().T
+ z = tensor.row().T
+ theta_ = theta.flatten()
+ z_ = z.flatten()
+ M = theano.shared(M_val.astype(theano.config.floatX))
+ w = theano.shared(w_val.astype(theano.config.floatX))
+ y = (tensor.exp(M.dot(theta_)) - (1 - w)).norm(2) - 1
+ y_diff = tensor.grad(y, theta_)
+ y_hess = z[0] * theano.gradient.hessian(y, theta_)
+ f_x = theano.function([theta], [y, y_diff], allow_input_downcast=True)
+ f_xz = theano.function([theta, z], [y, y_diff, y_hess],
+ allow_input_downcast=True)
+
+ def F(x=None, z=None):
+ if x is None:
+ return 1, cvxopt.matrix(.0001, (n,1))
+ elif z is None:
+ y, y_diff = f_x(x)
+ return cvxopt.matrix(float(y), (1, 1)),\
+ cvxopt.matrix(y_diff.astype("float64")).T
+ else:
+ y, y_diff, y_hess = f_xz(x, z)
+ return cvxopt.matrix(float(y), (1, 1)), \
+ cvxopt.matrix(y_diff.astype("float64")).T, \
+ cvxopt.matrix(y_hess.astype("float64"))
+
+ G = cvxopt.spdiag([1 for i in xrange(n)])
+ h = cvxopt.matrix(0.0, (n,1))
+
+ cvxopt.solvers.options['show_progress'] = False
+ try:
+ theta = cvxopt.solvers.cpl(c,F, G, h)['x']
+ except ArithmeticError:
+ print "ArithmeticError thrown, change initial point"+\
+ " given to the solver"
+
+ return 1 - np.exp(theta), theta
+
+
+@timeout.timeout(10)
+def l1obj_l2penalization(M_val, w_val, lbda):
+ """
+ Solves:
+ min - sum_j theta_j + lbda*|e^{M*theta} - (1 - w)|_2
+ s.t theta_j <= 0
+ """
+ assert len(M_val) == len(w_val)
+
+ if M_val.dtype == bool:
+ M_val = M_val.astype('float32')
+
+ if type(lbda) == int:
+ lbda = np.array(lbda)
+
+ m, n = M_val.shape
+
+ theta = tensor.row().T
+ z = tensor.row().T
+ theta_ = theta.flatten()
+ z_ = z.flatten()
+ M = theano.shared(M_val.astype(theano.config.floatX))
+ w = theano.shared(w_val.astype(theano.config.floatX))
+ lbda = theano.shared(lbda.astype(theano.config.floatX))
+ y = (theta_).norm(1) + lbda * (
+ tensor.exp(M.dot(theta_)) - (1 - w)).norm(2)
+ y_diff = tensor.grad(y, theta_)
+ y_hess = z[0] * theano.gradient.hessian(y, theta_)
+ f_x = theano.function([theta], [y, y_diff], allow_input_downcast=True)
+ f_xz = theano.function([theta, z], [y, y_diff, y_hess],
+ allow_input_downcast=True)
+
+ def F(x=None, z=None):
+ if x is None:
+ return 0, cvxopt.matrix(.0001, (n,1))
+ elif z is None:
+ y, y_diff = f_x(x)
+ return cvxopt.matrix(float(y), (1, 1)),\
+ cvxopt.matrix(y_diff.astype("float64")).T
+ else:
+ y, y_diff, y_hess = f_xz(x, z)
+ return cvxopt.matrix(float(y), (1, 1)), \
+ cvxopt.matrix(y_diff.astype("float64")).T, \
+ cvxopt.matrix(y_hess.astype("float64"))
+
+ G = cvxopt.spdiag([1 for i in xrange(n)])
+ h = cvxopt.matrix(0.0, (n,1))
+
+ cvxopt.solvers.options['show_progress'] = False
+ try:
+ theta = cvxopt.solvers.cp(F, G, h)['x']
+ except ArithmeticError:
+ print "ArithmeticError thrown, change initial point"+\
+ " given to the solver"
+
+ return 1 - np.exp(theta), theta
+
+
+def test():
+ """
+ unit test
+ """
+ lbda = 10
+ G = cascade_creation.InfluenceGraph(max_proba=.8)
+ G.erdos_init(n=100, p = .1)
+ A = cascade_creation.generate_cascades(G, .1, 2000)
+ M_val, w_val = cascade_creation.icc_matrixvector_for_node(A, 0)
+ p_vec, theta = l1obj_l2penalization(M_val, w_val, lbda)
+ print p_vec
+
+if __name__=="__main__":
+ test()
diff --git a/src/make_plots.py b/src/make_plots.py
new file mode 100644
index 0000000..9b1fdf4
--- /dev/null
+++ b/src/make_plots.py
@@ -0,0 +1,67 @@
+import matplotlib.pyplot as plt
+import numpy as np
+import cascade_creation
+import convex_optimization
+import algorithms
+import rip_condition
+
+
+def plot_rip_numberofnodes(max_proba, n_min, n_max, p_init, n_cascades, K_max):
+ """
+ Plots the RIP constant for varying number of nodes (n_max included)
+ """
+ x = np.arange(n_min, n_max+1)
+ y = []
+
+ for n_nodes in x:
+ print n_nodes
+ G = cascade_creation.InfluenceGraph(max_proba=.3)
+ G.erdos_init(n=n_nodes, p=.1) #TODO: handle different inits!
+ cascades = cascade_creation.generate_cascades(G, p_init=p_init,
+ n_cascades=n_cascades)
+ M, __ = cascade_creation.icc_matrixvector_for_node(cascades, None)
+ M = cascade_creation.normalize_matrix(M)
+ y.append(rip_condition.find_kth_rip_constants(M, 4)) #
+
+ print y
+
+ plt.clf()
+ plt.plot(x, y)
+ #plt.show()
+
+ return x, y
+
+
+def compare_greedy_and_lagrange_cs284r():
+ """
+ Compares the performance of the greedy algorithm on the
+ lagrangian sparse recovery objective on the Facebook dataset
+ for the CS284r project
+ """
+ G = cascade_creation.InfluenceGraph(max_proba = .8)
+ G.import_from_file("subset_facebook_SNAP.txt")
+ A = cascade_creation.generate_cascades(G, p_init=.05, n_cascades=100)
+
+ #Greedy
+ G_hat = algorithms.greedy_prediction(G, A)
+ algorithms.correctness_measure(G, G_hat, print_values=True)
+
+ #Lagrange Objective
+ G_hat = algorithms.recovery_l1obj_l2constraint(G, A,
+ passed_function=convex_optimization.l1obj_l2penalization,
+ floor_cstt=.1, lbda=10)
+ algorithms.correctness_measure(G, G_hat, print_values=True)
+
+
+def test():
+ """
+ unit test
+ """
+ if 0:
+ plot_rip_numberofnodes(max_proba=.3, n_min=30, n_max=30,
+ p_init=.01, n_cascades=100, K_max=4)
+ if 1:
+ compare_greedy_and_lagrange_cs284r()
+
+if __name__=="__main__":
+ test()
diff --git a/src/rip_condition.py b/src/rip_condition.py
new file mode 100644
index 0000000..fccf9a4
--- /dev/null
+++ b/src/rip_condition.py
@@ -0,0 +1,49 @@
+import numpy as np
+import cascade_creation
+import itertools
+from scipy import sparse
+from scipy.sparse import linalg
+
+def find_kth_rip_constants(M, k):
+ """
+ Returns max(A_1, A_2) where:
+ 1 + A_1 = arg max |Mx|_2^2 s.t. |x|_2 = 1 and |x|_0 = k
+ 1 - A_2 = arg min |Mx|_2^2 s.t. |x|_2 = 1 and |x|_0 = kx
+ """
+ delta = 0
+ print M.shape
+ for col_set in itertools.combinations(xrange(M.shape[1]), k):
+ M_kcol = M[:,list(col_set)]
+ delta_upper, delta_lower = upperlower_bound_rip(M_kcol)
+ delta = max(delta, max(delta_upper, delta_lower))
+ return delta
+
+
+def upperlower_bound_rip(M):
+ """
+ returns arg max/min |Mx|_2^2 s.t. |x|_2 = 1
+ which is the greatest eigenvalue value of M^T*M
+ or the square of the greatest singular value of M
+ """
+ M = sparse.csc_matrix(M)
+ s_upper = linalg.svds(M, 1, tol=.01, which ='LM', maxiter = 2000,
+ return_singular_vectors=False)
+ s_lower = linalg.svds(M, 1, tol=.01, which = 'SM', maxiter= 2000,
+ return_singular_vectors=False)
+ return s_upper ** 2 - 1, 1 -s_lower ** 2
+
+
+def test():
+ """
+ unit test
+ """
+ G = cascade_creation.InfluenceGraph(max_proba=.3)
+ G.erdos_init(n=10, p =1)
+ cascades = cascade_creation.generate_cascades(G, p_init=.3, n_cascades=10)
+ M, __ = cascade_creation.icc_matrixvector_for_node(cascades, None)
+ M = cascade_creation.normalize_matrix(M)
+ print find_kth_rip_constants(M, 5)
+
+
+if __name__=="__main__":
+ test()
diff --git a/src/timeout.py b/src/timeout.py
new file mode 100644
index 0000000..d7381c3
--- /dev/null
+++ b/src/timeout.py
@@ -0,0 +1,25 @@
+from functools import wraps
+import errno
+import os
+import signal
+
+class TimeoutError(Exception):
+ pass
+
+def timeout(seconds=10, error_message=os.strerror(errno.ETIME)):
+ def decorator(func):
+ def _handle_timeout(signum, frame):
+ raise TimeoutError(error_message)
+
+ def wrapper(*args, **kwargs):
+ signal.signal(signal.SIGALRM, _handle_timeout)
+ signal.alarm(seconds)
+ try:
+ result = func(*args, **kwargs)
+ finally:
+ signal.alarm(0)
+ return result
+
+ return wraps(func)(wrapper)
+
+ return decorator