renaming directory + creating dataset directory

author: jeanpouget-abadie <jean.pougetabadie@gmail.com> 2014-12-07 12:08:31 -0500
committer: jeanpouget-abadie <jean.pougetabadie@gmail.com> 2014-12-07 12:08:31 -0500
commit: 9de35421f25bf45158187daea4ddfedd1c93f3d8 (patch)
tree: f917008b6363a2b9dbff7855781f4fd5a10a6e94 /src/cascade_creation.py
parent: 6c874852773329f6fecbbc54476b30a37aa85b79 (diff)
download: cascades-9de35421f25bf45158187daea4ddfedd1c93f3d8.tar.gz
1 files changed, 162 insertions, 0 deletions
diff --git a/src/cascade_creation.py b/src/cascade_creation.py
new file mode 100644
index 0000000..9a26c03
--- /dev/null
+++ b/src/cascade_creation.py
@@ -0,0 +1,162 @@
+import networkx as nx
+import numpy as np
+import collections
+from itertools import izip
+from sklearn.preprocessing import normalize
+
+class InfluenceGraph(nx.Graph):
+    """
+    networkX graph with mat and logmat attributes
+    """
+    def __init__(self, max_proba, *args, **kwargs):
+        self.max_proba = max_proba
+        super(InfluenceGraph, self).__init__(*args, **kwargs)
+
+    def erdos_init(self, n, p):
+        G = nx.erdos_renyi_graph(n, p, directed=True)
+        self.add_edges_from(G.edges())
+
+    def import_from_file(self, file_name):
+        """
+        Takes a file from the Stanford collection of networks
+        """
+        with open(file_name, 'r') as f:
+            for edge in f:
+                if "#" not in edge:
+                    u, v = [int(node) for node in edge.split()]
+                    self.add_edge(u, v)
+
+    @property
+    def mat(self):
+        if not hasattr(self, '_mat'):
+            self._mat = (self.max_proba * np.random.rand(len(self), len(self))
+                         * np.asarray(nx.adjacency_matrix(self)))
+        return self._mat
+
+    @property
+    def logmat(self):
+        if not hasattr(self, '_logmat'):
+            self._logmat = np.log(1 - self.mat)
+        return self._logmat
+
+
+class Cascade(list):
+    """
+    Cascade object: list with attributes
+    """
+    def __init__(self, *args, **kwargs):
+        super(Cascade, self).__init__(*args, **kwargs)
+
+    def infection_time(self, node):
+        """
+        Returns lists of infections times for node i in cascade
+        """
+        infected_times = []
+        for t, infected_set in izip(xrange(len(self)), self):
+            if infected_set[node]:
+                infected_times.append(t)
+        if not infected_times:
+            infected_times.append(None)
+        return infected_times
+
+    def candidate_infectors(self, node):
+        """
+        Returns Counter of nodes infected just before node i was
+        """
+        candidate_infectors = collections.Counter()
+        for t in self.infection_time(node):
+            if t:  # avoid cases where t=0 or t is None
+                candidate_infectors.update(np.where(self[t-1])[0])
+        return candidate_infectors
+
+
+def icc_cascade(G, p_init):
+    """
+    Returns boolean vectors for one cascade
+    True means node was active at that time step
+    p_init: proba that node in seed set
+    """
+    susceptible = np.ones(G.number_of_nodes(), dtype=bool)
+    active = np.random.rand(G.number_of_nodes()) < p_init
+    susceptible = susceptible & np.logical_not(active)
+    cascade = Cascade()
+    while active.any():
+        cascade.append(active)
+        active = np.exp(np.dot(G.logmat, active)) \
+                    < np.random.rand(G.number_of_nodes())
+        active = active & susceptible
+        susceptible = susceptible & np.logical_not(active)
+    if not cascade:
+        print "Empty cascade, consider changing p_init or n_nodes. Retrying."
+        return icc_cascade(G, p_init)
+    return cascade
+
+
+def generate_cascades(G, p_init, n_cascades):
+    """
+    returns list of cascades
+    """""
+    return [icc_cascade(G,p_init) for i in xrange(n_cascades)]
+
+
+def icc_matrixvector_for_node(cascades, node):
+    """
+    for the ICC model:
+    Returns M, w in matrix form where rows of M are i = t + k.T
+    Excludes all (t,k) after node infection time; w = 1_{infected}
+    """
+    #TODO: you need to remove the variable corresponding to the node
+    #you are solving for!!!!
+    if node is None:
+        return np.vstack(cascades), None
+    else:
+        w = []
+        M = []
+        for cascade in cascades:
+            t_i = cascade.infection_time(node)[0]
+            if t_i != 0:
+                indicator = np.zeros(len(cascade[:t_i]))
+                if t_i > 0:
+                    indicator[-1] = 1
+                w.append(indicator)
+                M.append(np.array(cascade[:t_i]))
+        M = np.vstack(M)
+        w = np.hstack(w)
+    return M, w
+
+
+def normalize_matrix(M):
+    """
+    returns M with normalized (L_1 norm) columns
+    """
+    return normalize(M.astype("float32"), axis=0, norm="l2")
+
+
+def add_edges_from_proba_vector(G, p_node, node, floor_cstt):
+    """
+    Takes proba vector, node and adds edges to G by flooring very small
+    probabilities
+    Also updates G's mat matrix
+    """
+    floor_parent = np.nonzero(p_node*(p_node > floor_cstt))
+    for parent in floor_parent[0]:
+        G.add_edge(parent, node)
+    #TODO: update G's mat matrix
+    return G
+
+
+def test():
+    """
+    unit test
+    """
+    G = InfluenceGraph(max_proba = .3)
+    G.erdos_init(n = 10, p = 1)
+    import time
+    t0 = time.time()
+    A = generate_cascades(G, .1, 2)
+    M, w = icc_matrixvector_for_node(A, 0)
+    t1 = time.time()
+    print t1 - t0
+
+if __name__ == "__main__":
+    test()
author	jeanpouget-abadie <jean.pougetabadie@gmail.com>	2014-12-07 12:08:31 -0500
committer	jeanpouget-abadie <jean.pougetabadie@gmail.com>	2014-12-07 12:08:31 -0500
commit	9de35421f25bf45158187daea4ddfedd1c93f3d8 (patch)
tree	f917008b6363a2b9dbff7855781f4fd5a10a6e94 /src/cascade_creation.py
parent	6c874852773329f6fecbbc54476b30a37aa85b79 (diff)
download	cascades-9de35421f25bf45158187daea4ddfedd1c93f3d8.tar.gz