From 9de35421f25bf45158187daea4ddfedd1c93f3d8 Mon Sep 17 00:00:00 2001
From: jeanpouget-abadie <jean.pougetabadie@gmail.com>
Date: Sun, 7 Dec 2014 12:08:31 -0500
Subject: renaming directory + creating dataset directory

---
 src/algorithms.py          | 103 ++++++++++++++++++++++++++++
 src/cascade_creation.py    | 162 +++++++++++++++++++++++++++++++++++++++++++++
 src/convex_optimization.py | 135 +++++++++++++++++++++++++++++++++++++
 src/make_plots.py          |  67 +++++++++++++++++++
 src/rip_condition.py       |  49 ++++++++++++++
 src/timeout.py             |  25 +++++++
 6 files changed, 541 insertions(+)
 create mode 100644 src/algorithms.py
 create mode 100644 src/cascade_creation.py
 create mode 100644 src/convex_optimization.py
 create mode 100644 src/make_plots.py
 create mode 100644 src/rip_condition.py
 create mode 100644 src/timeout.py

(limited to 'src')

diff --git a/src/algorithms.py b/src/algorithms.py
new file mode 100644
index 0000000..0e240c9
--- /dev/null
+++ b/src/algorithms.py
@@ -0,0 +1,103 @@
+import numpy as np
+import networkx as nx
+import cascade_creation
+from collections import Counter
+from itertools import izip
+import convex_optimization
+import timeout
+
+
+def greedy_prediction(G, cascades):
+    """
+    Returns estimated graph from Greedy algorithm in "Learning Epidemic ..."
+    Only words for independent cascade model!
+    """
+    G_hat = cascade_creation.InfluenceGraph(max_proba=None)
+    G_hat.add_nodes_from(G.nodes())
+    for node in G_hat.nodes():
+        print node
+        # Avoid cases where infection time is None or 0
+        tmp = [cascade for cascade in cascades if cascade.infection_time(node)
+                                                                           [0]]
+        while tmp:
+            parents = Counter()
+            for cascade in tmp:
+                    parents += cascade.candidate_infectors(node)
+            parent = parents.most_common(1)[0][0]
+            G_hat.add_edge(parent, node)
+            tmp = [cascade for cascade in tmp if (
+                    cascade.infection_time(parent)[0] is not None and
+                    cascade.infection_time(parent)[0]+1 not in
+                        cascade.infection_time(node))]
+    return G_hat
+
+
+def recovery_l1obj_l2constraint(G, cascades, floor_cstt, passed_function,
+                                *args, **kwargs):
+    """
+    Returns estimated graph from convex program specified by passed_function
+    passed_function should have similar structure to ones in convex_optimation
+    """
+    G_hat = cascade_creation.InfluenceGraph(max_proba=None)
+    G_hat.add_nodes_from(G.nodes())
+    for node in G_hat.nodes():
+        print node
+        try:
+            M, w = cascade_creation.icc_matrixvector_for_node(cascades, node)
+            p_node, __ = passed_function(M,w, *args, **kwargs)
+            G_hat = cascade_creation.add_edges_from_proba_vector(G=G_hat,
+                        p_node=p_node, node=node, floor_cstt=floor_cstt)
+        except timeout.TimeoutError:
+            print "TimeoutError, skipping to next node"
+    return G_hat
+
+
+def correctness_measure(G, G_hat, print_values=False):
+    """
+    Measures correctness of estimated graph G_hat to ground truth G
+    """
+    edges = set(G.edges())
+    edges_hat = set(G_hat.edges())
+    fp = len(edges_hat - edges)
+    fn = len(edges - edges_hat)
+    tp = len(edges | edges_hat)
+    tn = G.number_of_nodes() ** 2 - fp - fn - tp
+
+    #Other metrics
+    precision = 1. * tp / (tp + fp)
+    recall = 1. * tp / (tp + fn)
+    f1_score = 2.* tp / (2 * tp + fp + fn)
+
+    if print_values:
+        print "False Positives: {}".format(fp)
+        print "False Negatives: {}".format(fn)
+        print "True Positives: {}".format(tp)
+        print "True Negatives: {}".format(tn)
+        print "-------------------------------"
+        print "Precision: {}".format(precision)
+        print "Recall: {}".format(recall)
+        print "F1 score: {}".format(f1_score)
+
+    return fp, fn, tp, tn
+
+
+def test():
+    """
+    unit test
+    """
+    G = cascade_creation.InfluenceGraph(max_proba = .8)
+    G.erdos_init(n = 100, p = .05)
+    import time
+    t0 = time.time()
+    A = cascade_creation.generate_cascades(G, .2, 10000)
+    if 1:
+        G_hat = greedy_prediction(G, A)
+    if 0:
+        G_hat = recovery_l1obj_l2constraint(G, A,
+                passed_function=convex_optimization.l1obj_l2penalization,
+                floor_cstt=.1, lbda=10)
+    correctness_measure(G, G_hat, print_values=True)
+
+
+if __name__=="__main__":
+    test()
diff --git a/src/cascade_creation.py b/src/cascade_creation.py
new file mode 100644
index 0000000..9a26c03
--- /dev/null
+++ b/src/cascade_creation.py
@@ -0,0 +1,162 @@
+import networkx as nx
+import numpy as np
+import collections
+from itertools import izip
+from sklearn.preprocessing import normalize
+
+class InfluenceGraph(nx.Graph):
+    """
+    networkX graph with mat and logmat attributes
+    """
+    def __init__(self, max_proba, *args, **kwargs):
+        self.max_proba = max_proba
+        super(InfluenceGraph, self).__init__(*args, **kwargs)
+
+    def erdos_init(self, n, p):
+        G = nx.erdos_renyi_graph(n, p, directed=True)
+        self.add_edges_from(G.edges())
+
+    def import_from_file(self, file_name):
+        """
+        Takes a file from the Stanford collection of networks
+        """
+        with open(file_name, 'r') as f:
+            for edge in f:
+                if "#" not in edge:
+                    u, v = [int(node) for node in edge.split()]
+                    self.add_edge(u, v)
+
+    @property
+    def mat(self):
+        if not hasattr(self, '_mat'):
+            self._mat = (self.max_proba * np.random.rand(len(self), len(self))
+                         * np.asarray(nx.adjacency_matrix(self)))
+        return self._mat
+
+    @property
+    def logmat(self):
+        if not hasattr(self, '_logmat'):
+            self._logmat = np.log(1 - self.mat)
+        return self._logmat
+
+
+class Cascade(list):
+    """
+    Cascade object: list with attributes
+    """
+    def __init__(self, *args, **kwargs):
+        super(Cascade, self).__init__(*args, **kwargs)
+
+    def infection_time(self, node):
+        """
+        Returns lists of infections times for node i in cascade
+        """
+        infected_times = []
+        for t, infected_set in izip(xrange(len(self)), self):
+            if infected_set[node]:
+                infected_times.append(t)
+        if not infected_times:
+            infected_times.append(None)
+        return infected_times
+
+    def candidate_infectors(self, node):
+        """
+        Returns Counter of nodes infected just before node i was
+        """
+        candidate_infectors = collections.Counter()
+        for t in self.infection_time(node):
+            if t:  # avoid cases where t=0 or t is None
+                candidate_infectors.update(np.where(self[t-1])[0])
+        return candidate_infectors
+
+
+def icc_cascade(G, p_init):
+    """
+    Returns boolean vectors for one cascade
+    True means node was active at that time step
+    p_init: proba that node in seed set
+    """
+    susceptible = np.ones(G.number_of_nodes(), dtype=bool)
+    active = np.random.rand(G.number_of_nodes()) < p_init
+    susceptible = susceptible & np.logical_not(active)
+    cascade = Cascade()
+    while active.any():
+        cascade.append(active)
+        active = np.exp(np.dot(G.logmat, active)) \
+                    < np.random.rand(G.number_of_nodes())
+        active = active & susceptible
+        susceptible = susceptible & np.logical_not(active)
+    if not cascade:
+        print "Empty cascade, consider changing p_init or n_nodes. Retrying."
+        return icc_cascade(G, p_init)
+    return cascade
+
+
+def generate_cascades(G, p_init, n_cascades):
+    """
+    returns list of cascades
+    """""
+    return [icc_cascade(G,p_init) for i in xrange(n_cascades)]
+
+
+def icc_matrixvector_for_node(cascades, node):
+    """
+    for the ICC model:
+    Returns M, w in matrix form where rows of M are i = t + k.T
+    Excludes all (t,k) after node infection time; w = 1_{infected}
+    """
+    #TODO: you need to remove the variable corresponding to the node
+    #you are solving for!!!!
+    if node is None:
+        return np.vstack(cascades), None
+    else:
+        w = []
+        M = []
+        for cascade in cascades:
+            t_i = cascade.infection_time(node)[0]
+            if t_i != 0:
+                indicator = np.zeros(len(cascade[:t_i]))
+                if t_i > 0:
+                    indicator[-1] = 1
+                w.append(indicator)
+                M.append(np.array(cascade[:t_i]))
+        M = np.vstack(M)
+        w = np.hstack(w)
+    return M, w
+
+
+def normalize_matrix(M):
+    """
+    returns M with normalized (L_1 norm) columns
+    """
+    return normalize(M.astype("float32"), axis=0, norm="l2")
+
+
+def add_edges_from_proba_vector(G, p_node, node, floor_cstt):
+    """
+    Takes proba vector, node and adds edges to G by flooring very small
+    probabilities
+    Also updates G's mat matrix
+    """
+    floor_parent = np.nonzero(p_node*(p_node > floor_cstt))
+    for parent in floor_parent[0]:
+        G.add_edge(parent, node)
+    #TODO: update G's mat matrix
+    return G
+
+
+def test():
+    """
+    unit test
+    """
+    G = InfluenceGraph(max_proba = .3)
+    G.erdos_init(n = 10, p = 1)
+    import time
+    t0 = time.time()
+    A = generate_cascades(G, .1, 2)
+    M, w = icc_matrixvector_for_node(A, 0)
+    t1 = time.time()
+    print t1 - t0
+
+if __name__ == "__main__":
+    test()
diff --git a/src/convex_optimization.py b/src/convex_optimization.py
new file mode 100644
index 0000000..530e3f5
--- /dev/null
+++ b/src/convex_optimization.py
@@ -0,0 +1,135 @@
+import theano
+import cascade_creation
+from theano import tensor, function
+import numpy as np
+import timeout
+import cvxopt
+
+
+@timeout.timeout(10)
+def l1obj_l2constraint(M_val, w_val):
+    """
+    Solves:
+        min - sum_j theta_j
+        s.t theta_j <= 0
+            |e^{M*theta} - (1 - w)|_2 <= 1
+    """
+    assert len(M_val) == len(w_val)
+
+    if M_val.dtype == bool:
+        M_val = M_val.astype('float32')
+
+    m, n = M_val.shape
+    c = cvxopt.matrix(-1.0, (n,1))
+
+    theta = tensor.row().T
+    z = tensor.row().T
+    theta_ = theta.flatten()
+    z_ = z.flatten()
+    M = theano.shared(M_val.astype(theano.config.floatX))
+    w = theano.shared(w_val.astype(theano.config.floatX))
+    y = (tensor.exp(M.dot(theta_)) - (1 - w)).norm(2) - 1
+    y_diff = tensor.grad(y, theta_)
+    y_hess = z[0] * theano.gradient.hessian(y, theta_)
+    f_x = theano.function([theta], [y, y_diff], allow_input_downcast=True)
+    f_xz = theano.function([theta, z], [y, y_diff, y_hess],
+                            allow_input_downcast=True)
+
+    def F(x=None, z=None):
+        if x is None:
+            return 1, cvxopt.matrix(.0001, (n,1))
+        elif z is None:
+            y, y_diff = f_x(x)
+            return cvxopt.matrix(float(y), (1, 1)),\
+                    cvxopt.matrix(y_diff.astype("float64")).T
+        else:
+            y, y_diff, y_hess = f_xz(x, z)
+            return cvxopt.matrix(float(y), (1, 1)), \
+                    cvxopt.matrix(y_diff.astype("float64")).T, \
+                    cvxopt.matrix(y_hess.astype("float64"))
+
+    G = cvxopt.spdiag([1 for i in xrange(n)])
+    h = cvxopt.matrix(0.0, (n,1))
+
+    cvxopt.solvers.options['show_progress'] = False
+    try:
+        theta = cvxopt.solvers.cpl(c,F, G, h)['x']
+    except ArithmeticError:
+        print "ArithmeticError thrown, change initial point"+\
+              " given to the solver"
+
+    return 1 - np.exp(theta), theta
+
+
+@timeout.timeout(10)
+def l1obj_l2penalization(M_val, w_val, lbda):
+    """
+    Solves:
+        min - sum_j theta_j + lbda*|e^{M*theta} - (1 - w)|_2
+        s.t theta_j <= 0
+    """
+    assert len(M_val) == len(w_val)
+
+    if M_val.dtype == bool:
+        M_val = M_val.astype('float32')
+
+    if type(lbda) == int:
+        lbda = np.array(lbda)
+
+    m, n = M_val.shape
+
+    theta = tensor.row().T
+    z = tensor.row().T
+    theta_ = theta.flatten()
+    z_ = z.flatten()
+    M = theano.shared(M_val.astype(theano.config.floatX))
+    w = theano.shared(w_val.astype(theano.config.floatX))
+    lbda = theano.shared(lbda.astype(theano.config.floatX))
+    y = (theta_).norm(1) + lbda * (
+        tensor.exp(M.dot(theta_)) - (1 - w)).norm(2)
+    y_diff = tensor.grad(y, theta_)
+    y_hess = z[0] * theano.gradient.hessian(y, theta_)
+    f_x = theano.function([theta], [y, y_diff], allow_input_downcast=True)
+    f_xz = theano.function([theta, z], [y, y_diff, y_hess],
+                           allow_input_downcast=True)
+
+    def F(x=None, z=None):
+        if x is None:
+            return 0, cvxopt.matrix(.0001, (n,1))
+        elif z is None:
+            y, y_diff = f_x(x)
+            return cvxopt.matrix(float(y), (1, 1)),\
+                   cvxopt.matrix(y_diff.astype("float64")).T
+        else:
+            y, y_diff, y_hess = f_xz(x, z)
+            return cvxopt.matrix(float(y), (1, 1)), \
+                    cvxopt.matrix(y_diff.astype("float64")).T, \
+                    cvxopt.matrix(y_hess.astype("float64"))
+
+    G = cvxopt.spdiag([1 for i in xrange(n)])
+    h = cvxopt.matrix(0.0, (n,1))
+
+    cvxopt.solvers.options['show_progress'] = False
+    try:
+        theta = cvxopt.solvers.cp(F, G, h)['x']
+    except ArithmeticError:
+        print "ArithmeticError thrown, change initial point"+\
+              " given to the solver"
+
+    return 1 - np.exp(theta), theta
+
+
+def test():
+    """
+    unit test
+    """
+    lbda = 10
+    G = cascade_creation.InfluenceGraph(max_proba=.8)
+    G.erdos_init(n=100, p = .1)
+    A = cascade_creation.generate_cascades(G, .1, 2000)
+    M_val, w_val = cascade_creation.icc_matrixvector_for_node(A, 0)
+    p_vec, theta = l1obj_l2penalization(M_val, w_val, lbda)
+    print p_vec
+
+if __name__=="__main__":
+    test()
diff --git a/src/make_plots.py b/src/make_plots.py
new file mode 100644
index 0000000..9b1fdf4
--- /dev/null
+++ b/src/make_plots.py
@@ -0,0 +1,67 @@
+import matplotlib.pyplot as plt
+import numpy as np
+import cascade_creation
+import convex_optimization
+import algorithms
+import rip_condition
+
+
+def plot_rip_numberofnodes(max_proba, n_min, n_max, p_init, n_cascades, K_max):
+    """
+    Plots the RIP constant for varying number of nodes (n_max included)
+    """
+    x = np.arange(n_min, n_max+1)
+    y = []
+
+    for n_nodes in x:
+        print n_nodes
+        G = cascade_creation.InfluenceGraph(max_proba=.3)
+        G.erdos_init(n=n_nodes, p=.1) #TODO: handle different inits!
+        cascades = cascade_creation.generate_cascades(G, p_init=p_init,
+                                                      n_cascades=n_cascades)
+        M, __ = cascade_creation.icc_matrixvector_for_node(cascades, None)
+        M = cascade_creation.normalize_matrix(M)
+        y.append(rip_condition.find_kth_rip_constants(M, 4)) #
+
+    print y
+
+    plt.clf()
+    plt.plot(x, y)
+    #plt.show()
+
+    return x, y
+
+
+def compare_greedy_and_lagrange_cs284r():
+    """
+    Compares the performance of the greedy algorithm on the
+    lagrangian sparse recovery objective on the Facebook dataset
+    for the CS284r project
+    """
+    G = cascade_creation.InfluenceGraph(max_proba = .8)
+    G.import_from_file("subset_facebook_SNAP.txt")
+    A = cascade_creation.generate_cascades(G, p_init=.05, n_cascades=100)
+
+    #Greedy
+    G_hat = algorithms.greedy_prediction(G, A)
+    algorithms.correctness_measure(G, G_hat, print_values=True)
+
+    #Lagrange Objective
+    G_hat = algorithms.recovery_l1obj_l2constraint(G, A,
+            passed_function=convex_optimization.l1obj_l2penalization,
+            floor_cstt=.1, lbda=10)
+    algorithms.correctness_measure(G, G_hat, print_values=True)
+
+
+def test():
+    """
+    unit test
+    """
+    if 0:
+        plot_rip_numberofnodes(max_proba=.3, n_min=30, n_max=30,
+                            p_init=.01, n_cascades=100, K_max=4)
+    if 1:
+        compare_greedy_and_lagrange_cs284r()
+
+if __name__=="__main__":
+    test()
diff --git a/src/rip_condition.py b/src/rip_condition.py
new file mode 100644
index 0000000..fccf9a4
--- /dev/null
+++ b/src/rip_condition.py
@@ -0,0 +1,49 @@
+import numpy as np
+import cascade_creation
+import itertools
+from scipy import sparse
+from scipy.sparse import linalg
+
+def find_kth_rip_constants(M, k):
+    """
+    Returns max(A_1, A_2) where:
+    1 + A_1 = arg max |Mx|_2^2 s.t. |x|_2 = 1 and |x|_0 = k
+    1 - A_2 = arg min |Mx|_2^2 s.t. |x|_2 = 1 and |x|_0 = kx
+    """
+    delta = 0
+    print M.shape
+    for col_set in itertools.combinations(xrange(M.shape[1]), k):
+        M_kcol = M[:,list(col_set)]
+        delta_upper, delta_lower = upperlower_bound_rip(M_kcol)
+        delta = max(delta, max(delta_upper, delta_lower))
+    return delta
+
+
+def upperlower_bound_rip(M):
+    """
+    returns arg max/min |Mx|_2^2 s.t. |x|_2 = 1
+    which is the greatest eigenvalue value of M^T*M
+    or the square of the greatest singular value of M
+    """
+    M = sparse.csc_matrix(M)
+    s_upper = linalg.svds(M, 1, tol=.01, which ='LM', maxiter = 2000,
+                                return_singular_vectors=False)
+    s_lower = linalg.svds(M, 1, tol=.01, which = 'SM', maxiter= 2000,
+                                return_singular_vectors=False)
+    return s_upper ** 2 - 1, 1 -s_lower ** 2
+
+
+def test():
+    """
+    unit test
+    """
+    G = cascade_creation.InfluenceGraph(max_proba=.3)
+    G.erdos_init(n=10, p =1)
+    cascades = cascade_creation.generate_cascades(G, p_init=.3, n_cascades=10)
+    M, __ = cascade_creation.icc_matrixvector_for_node(cascades, None)
+    M = cascade_creation.normalize_matrix(M)
+    print find_kth_rip_constants(M, 5)
+
+
+if __name__=="__main__":
+    test()
diff --git a/src/timeout.py b/src/timeout.py
new file mode 100644
index 0000000..d7381c3
--- /dev/null
+++ b/src/timeout.py
@@ -0,0 +1,25 @@
+from functools import wraps
+import errno
+import os
+import signal
+
+class TimeoutError(Exception):
+    pass
+
+def timeout(seconds=10, error_message=os.strerror(errno.ETIME)):
+    def decorator(func):
+        def _handle_timeout(signum, frame):
+            raise TimeoutError(error_message)
+
+        def wrapper(*args, **kwargs):
+            signal.signal(signal.SIGALRM, _handle_timeout)
+            signal.alarm(seconds)
+            try:
+                result = func(*args, **kwargs)
+            finally:
+                signal.alarm(0)
+            return result
+
+        return wraps(func)(wrapper)
+
+    return decorator
-- 
cgit v1.2.3-70-g09d2