From f1762904c648b2089031ba6ce46ccaaac4f3514c Mon Sep 17 00:00:00 2001
From: Thibaut Horel <thibaut.horel@gmail.com>
Date: Mon, 30 Nov 2015 19:57:58 -0500
Subject: Big code cleanup

---
 simulation/active_blocks.py |  50 ++++++++-----------
 simulation/bayes.py         | 117 --------------------------------------------
 simulation/main.py          | 101 --------------------------------------
 simulation/mcmc.py          | 117 ++++++++++++++++++++++++++++++++++++++++++++
 simulation/mle.py           |  72 +++++++++++++++++++++++++++
 simulation/mleNode.py       |  72 ---------------------------
 simulation/utils.py         |  64 ++++++++++++++++++++++++
 7 files changed, 273 insertions(+), 320 deletions(-)
 delete mode 100644 simulation/bayes.py
 delete mode 100644 simulation/main.py
 create mode 100644 simulation/mcmc.py
 create mode 100644 simulation/mle.py
 delete mode 100644 simulation/mleNode.py
 create mode 100644 simulation/utils.py

(limited to 'simulation')

diff --git a/simulation/active_blocks.py b/simulation/active_blocks.py
index 569cb6c..a1f6e76 100644
--- a/simulation/active_blocks.py
+++ b/simulation/active_blocks.py
@@ -1,11 +1,12 @@
-import main as mn
+import utils
 import theano
 from theano import tensor as tsr
 import blocks
-import blocks.algorithms, blocks.main_loop, blocks.extensions.monitoring
+from blocks import algorithms, main_loop
+import blocks.extensions as be
+import blocks.extensions.monitoring as bm
 import picklable_itertools
 import numpy as np
-from six.moves import range
 import fuel
 import fuel.datasets
 import collections
@@ -19,28 +20,16 @@ class LearnedDataset(fuel.datasets.Dataset):
     """
     provides_sources = ('x', 's')
 
-    def __init__(self, node_p, graph, source=mn.var_source, **kwargs):
+    def __init__(self, node_p, graph, **kwargs):
         super(LearnedDataset, self).__init__(**kwargs)
         self.node_p = node_p
         self.graph = graph
-        self.n_cascades = 1  # nbr of cascades of total size approx = request
-        self.source = lambda graph, t : source(graph, t, self.node_p)
+        self.source = lambda graph: utils.random_source(graph, self.node_p)
 
     def get_data(self, state=None, request=None):
-        floatX = 'int8'
-        x_obs = np.empty((request, len(self.graph)), dtype=floatX)
-        s_obs = np.empty((request, len(self.graph)), dtype=floatX)
-        i = 0
-        while i < request:
-            x_tmp, s_tmp = mn.build_cascade_list(
-                mn.simulate_cascades(self.n_cascades, self.graph, self.source),
-                collapse=True
-            )
-            x_obs[i:i + len(x_tmp)] = x_tmp[:request - i]
-            s_obs[i:i + len(x_tmp)] = s_tmp[:request - i]
-            i += len(x_tmp)
-            self.n_cascades += 1  # learn optimal nbr in loop
-        self.n_cascades = max(1, self.n_cascades - 2)
+        # floatX = 'int8'
+        x_obs, s_obs = utils.simulate_cascades(request, self.graph, self.source)
+
         return (x_obs, s_obs)
 
 
@@ -115,8 +104,9 @@ def create_fixed_data_stream(n_cascades, graph, batch_size, shuffle=True):
         -shuffle (bool): shuffle minibatches but not within minibatch, else
         sequential (non-shuffled) batches are used
     """
-    cascades = mn.build_cascade_list(mn.simulate_cascades(n_cascades, graph),
-                                     collapse=True)
+    cascades = utils.build_cascade_list(
+        utils.simulate_cascades(n_cascades, graph),
+        collapse=True)
     x_obs, s_obs = cascades[0], cascades[1]
     data_set = fuel.datasets.base.IndexableDataset(collections.OrderedDict(
         [('x', x_obs), ('s', s_obs)]
@@ -138,23 +128,23 @@ def create_learned_data_stream(graph, batch_size):
 
 if __name__ == "__main__":
     batch_size = 1000
-    graph = mn.create_star(1000)
+    graph = utils.create_wheel(1000)
     print('GRAPH:\n', graph, '\n-------------\n')
 
     x, s, params, cost = create_mle_model(graph)
     rmse, g_shared = rmse_error(graph, params)
 
-    alg = blocks.algorithms.GradientDescent(
-       cost=-cost, parameters=[params], step_rule=blocks.algorithms.AdaDelta()
+    alg = algorithms.GradientDescent(
+        cost=-cost, parameters=[params], step_rule=blocks.algorithms.AdaDelta()
     )
     data_stream = create_learned_data_stream(graph, batch_size)
-    loop = blocks.main_loop.MainLoop(
+    loop = main_loop.MainLoop(
         alg, data_stream,
         extensions=[
-            blocks.extensions.FinishAfter(after_n_batches = 10**4),
-            blocks.extensions.monitoring.TrainingDataMonitoring([cost, params,
-                rmse, g_shared], after_batch=True),
-            blocks.extensions.Printing(every_n_batches = 10),
+            be.FinishAfter(after_n_batches=10**4),
+            bm.TrainingDataMonitoring([cost, params,
+                                       rmse, g_shared], after_batch=True),
+            be.Printing(every_n_batches=10),
             ActiveLearning(data_stream.dataset),
         ]
     )
diff --git a/simulation/bayes.py b/simulation/bayes.py
deleted file mode 100644
index bde9e94..0000000
--- a/simulation/bayes.py
+++ /dev/null
@@ -1,117 +0,0 @@
-import pymc
-import numpy as np
-
-def glm_node_setup(cascade, y_obs, prior=None, *args, **kwargs):
-    """
-    Build an IC PyMC node-level model from:
-        -observed cascades: cascade
-        -outcome vector: y_obs
-        -desired PyMC prior and parameters: prior, *args
-    Note: we use the glm formulation: y = Bernoulli[f(x.dot(theta))]
-    """
-    n_nodes = len(cascade[0])
-
-    # Container class for node's parents
-    theta = np.empty(n_nodes, dtype=object)
-    for j in xrange(n_nodes):
-        if prior is None:
-            theta[j] = pymc.Beta('theta_{}'.format(j), alpha=1, beta=1)
-        else:
-            theta[j] = prior('theta_{}'.format(j), *args, **kwargs)
-
-    # Observed container class for cascade realization
-    x = np.empty(n_nodes, dtype=object)
-    for i, val in enumerate(cascade.T):
-        x[i] = pymc.Normal('x_{}'.format(i), 0, 1, value=val, observed=True)
-
-    @pymc.deterministic
-    def glm_p(x=x, theta=theta):
-        return 1. - np.exp(-x.dot(theta))
-
-    @pymc.observed
-    def y(glm_p=glm_p, value=y_obs):
-        return pymc.bernoulli_like(value, glm_p)
-
-    return pymc.Model([y, pymc.Container(theta), pymc.Container(x)])
-
-
-def formatLabel(s, n):
-    return '0'*(len(str(n)) - len(str(s))) + str(s)
-
-
-def mc_graph_setup(infected, susceptible, prior=None, *args, **kwargs):
-    """
-    Build an IC PyMC graph-level model from:
-        -infected nodes over time: list/tuple of list/tuple of np.array
-        -susceptible nodes over time: same format as above
-    Note: we use the Markov Chain formulation: X_{t+1}|X_t,theta = f(X_t.theta)
-    """
-
-    # Container class for graph parameters
-    n_nodes = len(infected[0][0])
-    theta = np.empty((n_nodes,n_nodes), dtype=object)
-    if prior is None:
-        for i in xrange(n_nodes):
-            for j in xrange(n_nodes):
-                theta[i, j] = pymc.Beta('theta_{}{}'.format(formatLabel(i,
-                                        n_nodes-1), formatLabel(j, n_nodes-1)),
-                                        alpha=1, beta=1)
-    else:
-        theta = prior(theta=theta, *args, **kwargs)
-
-    # Container class for cascade realization
-    x = {}
-    for i, cascade in enumerate(infected):
-        for j, step in enumerate(cascade):
-            for k, node in enumerate(step):
-                if j and susceptible[i][j][k]:
-                    p = 1. - pymc.exp(-cascade[j-1].dot(theta[k]))
-                else:
-                    p = .5
-                x[i, j, k] = pymc.Bernoulli('x_{}{}{}'.format(i, j, k), p=p,
-                        value=node, observed=True)
-
-    return pymc.Model([pymc.Container(theta), pymc.Container(x)])
-
-
-if __name__=="__main__":
-    import main
-    import matplotlib.pyplot as plt
-    import seaborn
-    seaborn.set_style('whitegrid')
-    g = np.array([[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]])
-    p = 0.5
-    g = np.log(1. / (1 - p * g))
-
-    print('running the graph-level MC set-up')
-    n_nodes = len(g)
-    cascades = main.simulate_cascades(1000, g)
-    infected, susc = main.build_cascade_list(cascades)
-    model = mc_graph_setup(infected, susc)
-    mcmc = pymc.MCMC(model)
-    mcmc.sample(10**4, 1000)
-    fig, ax = plt.subplots(len(g), len(g))
-    for i in xrange(n_nodes):
-        for j in xrange(n_nodes):
-            if n_nodes < 5:
-                ax[i,j].locator_params(nbins=3, axis='x')
-            else:
-                ax[i, j].get_xaxis().set_ticks([])
-                ax[i, j].get_yaxis().set_ticks([])
-            it, jt = formatLabel(i, n_nodes-1), formatLabel(j, n_nodes-1)
-            ax[i,j].hist(mcmc.trace('theta_{}{}'.format(it,jt))[:], normed=True)
-            ax[i, j].set_xlim([0,1])
-            ax[i, j].plot([g[i, j]]*2, [0, ax[i,j].get_ylim()[-1]], color='red')
-    plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=.1)
-    plt.show()
-
-    print('running the node level set-up')
-    node = 0
-    cascades = main.simulate_cascades(100, g)
-    cascade, y_obs = main.build_matrix(cascades, node)
-    model = glm_node_setup(cascade, y_obs)
-    mcmc = pymc.MCMC(model)
-    mcmc.sample(1e5, 1e4)
-    plt.hist(mcmc.trace('theta_1')[:], bins=1e2)
-    plt.show()
-
diff --git a/simulation/main.py b/simulation/main.py
deleted file mode 100644
index 81133c7..0000000
--- a/simulation/main.py
+++ /dev/null
@@ -1,101 +0,0 @@
-import mleNode as mn
-
-import numpy as np
-from numpy.linalg import norm
-import numpy.random as nr
-from scipy.optimize import minimize
-import matplotlib.pyplot as plt
-import seaborn
-from random import random, randint
-from six.moves import range
-
-seaborn.set_style("white")
-
-
-def create_random_graph(n_nodes, p=.5):
-    graph = .5 * np.random.binomial(2, p=.5, size=(n_nodes, n_nodes))
-    for k in range(len(graph)):
-        graph[k, k] = 0
-    return np.log(1. / (1 - p * graph))
-
-
-def create_star(n_nodes, p=.5):
-    graph = np.zeros((n_nodes, n_nodes))
-    graph[0] = np.ones((n_nodes,))
-    graph[0, 0] = 0
-    for index, row in enumerate(graph[1:-1]):
-        row[index + 1] = 1
-    graph[-1, 1] = 1
-    return np.log(1. / (1 - p * graph))
-
-
-def simulate_cascade(x, graph):
-    """
-    Simulate an IC cascade given a graph and initial state.
-
-    For each time step we yield:
-        - susc: the nodes susceptible at the beginning of this time step
-        - x: the subset of susc who became infected
-    """
-    yield x, np.zeros(graph.shape[0], dtype=bool)
-    susc = np.ones(graph.shape[0], dtype=bool)
-    while np.any(x):
-        susc = susc ^ x  # nodes infected at previous step are now inactive
-        if not np.any(susc):
-            break
-        x = 1 - np.exp(-np.dot(graph.T, x))
-        y = nr.random(x.shape[0])
-        x = (x >= y) & susc
-        yield x, susc
-
-
-def uniform_source(graph, *args, **kwargs):
-    x0 = np.zeros(graph.shape[0], dtype=bool)
-    x0[nr.randint(0, graph.shape[0])] = True
-    return x0
-
-
-def var_source(graph, t, node_p=None, *args, **kwargs):
-    if node_p is None:
-        node_p = np.ones(len(graph)) / len(graph)
-    x0 = np.zeros(graph.shape[0], dtype=bool)
-    x0[nr.choice(a=len(graph), p=node_p)] = True
-    return x0
-
-
-def simulate_cascades(n, graph, source=uniform_source):
-    for t in range(n):
-        x0 = source(graph, t)
-        yield simulate_cascade(x0, graph)
-
-
-def build_cascade_list(cascades, collapse=False):
-    x, s = [], []
-    for cascade in cascades:
-        xlist, slist = zip(*cascade)
-        x.append(np.vstack(xlist))
-        s.append(np.vstack(slist))
-    if not collapse:
-        return x, s
-    else:
-        return np.vstack(x), np.vstack(s)
-
-
-if __name__ == "__main__":
-    #g = np.array([[0, 0, 1], [0, 0, 0.5], [0, 0, 0]])
-    #p = 0.5
-    #g = np.log(1. / (1 - p * g))
-    g = create_random_graph(n_nodes=3)
-    print(g)
-    sizes = [10**3]
-    for si in sizes:
-        cascades = simulate_cascades(si, g)
-        cascade, y_obs = mn.build_matrix(cascades, 2)
-        print(mn.infer(cascade, y_obs))
-        #conf = mn.bootstrap(cascade, y_obs, n_iter=100)
-        #estimand = np.linalg.norm(np.delete(conf - g[0], 0, axis=1), axis=1)
-        #plt.hist(estimand, bins=40)
-        #plt.show()
-        #error.append(mn.confidence_interval(*np.histogram(estimand, bins=50)))
-    #plt.plot(sizes, error)
-    #plt.show()
diff --git a/simulation/mcmc.py b/simulation/mcmc.py
new file mode 100644
index 0000000..bde9e94
--- /dev/null
+++ b/simulation/mcmc.py
@@ -0,0 +1,117 @@
+import pymc
+import numpy as np
+
+def glm_node_setup(cascade, y_obs, prior=None, *args, **kwargs):
+    """
+    Build an IC PyMC node-level model from:
+        -observed cascades: cascade
+        -outcome vector: y_obs
+        -desired PyMC prior and parameters: prior, *args
+    Note: we use the glm formulation: y = Bernoulli[f(x.dot(theta))]
+    """
+    n_nodes = len(cascade[0])
+
+    # Container class for node's parents
+    theta = np.empty(n_nodes, dtype=object)
+    for j in xrange(n_nodes):
+        if prior is None:
+            theta[j] = pymc.Beta('theta_{}'.format(j), alpha=1, beta=1)
+        else:
+            theta[j] = prior('theta_{}'.format(j), *args, **kwargs)
+
+    # Observed container class for cascade realization
+    x = np.empty(n_nodes, dtype=object)
+    for i, val in enumerate(cascade.T):
+        x[i] = pymc.Normal('x_{}'.format(i), 0, 1, value=val, observed=True)
+
+    @pymc.deterministic
+    def glm_p(x=x, theta=theta):
+        return 1. - np.exp(-x.dot(theta))
+
+    @pymc.observed
+    def y(glm_p=glm_p, value=y_obs):
+        return pymc.bernoulli_like(value, glm_p)
+
+    return pymc.Model([y, pymc.Container(theta), pymc.Container(x)])
+
+
+def formatLabel(s, n):
+    return '0'*(len(str(n)) - len(str(s))) + str(s)
+
+
+def mc_graph_setup(infected, susceptible, prior=None, *args, **kwargs):
+    """
+    Build an IC PyMC graph-level model from:
+        -infected nodes over time: list/tuple of list/tuple of np.array
+        -susceptible nodes over time: same format as above
+    Note: we use the Markov Chain formulation: X_{t+1}|X_t,theta = f(X_t.theta)
+    """
+
+    # Container class for graph parameters
+    n_nodes = len(infected[0][0])
+    theta = np.empty((n_nodes,n_nodes), dtype=object)
+    if prior is None:
+        for i in xrange(n_nodes):
+            for j in xrange(n_nodes):
+                theta[i, j] = pymc.Beta('theta_{}{}'.format(formatLabel(i,
+                                        n_nodes-1), formatLabel(j, n_nodes-1)),
+                                        alpha=1, beta=1)
+    else:
+        theta = prior(theta=theta, *args, **kwargs)
+
+    # Container class for cascade realization
+    x = {}
+    for i, cascade in enumerate(infected):
+        for j, step in enumerate(cascade):
+            for k, node in enumerate(step):
+                if j and susceptible[i][j][k]:
+                    p = 1. - pymc.exp(-cascade[j-1].dot(theta[k]))
+                else:
+                    p = .5
+                x[i, j, k] = pymc.Bernoulli('x_{}{}{}'.format(i, j, k), p=p,
+                        value=node, observed=True)
+
+    return pymc.Model([pymc.Container(theta), pymc.Container(x)])
+
+
+if __name__=="__main__":
+    import main
+    import matplotlib.pyplot as plt
+    import seaborn
+    seaborn.set_style('whitegrid')
+    g = np.array([[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]])
+    p = 0.5
+    g = np.log(1. / (1 - p * g))
+
+    print('running the graph-level MC set-up')
+    n_nodes = len(g)
+    cascades = main.simulate_cascades(1000, g)
+    infected, susc = main.build_cascade_list(cascades)
+    model = mc_graph_setup(infected, susc)
+    mcmc = pymc.MCMC(model)
+    mcmc.sample(10**4, 1000)
+    fig, ax = plt.subplots(len(g), len(g))
+    for i in xrange(n_nodes):
+        for j in xrange(n_nodes):
+            if n_nodes < 5:
+                ax[i,j].locator_params(nbins=3, axis='x')
+            else:
+                ax[i, j].get_xaxis().set_ticks([])
+                ax[i, j].get_yaxis().set_ticks([])
+            it, jt = formatLabel(i, n_nodes-1), formatLabel(j, n_nodes-1)
+            ax[i,j].hist(mcmc.trace('theta_{}{}'.format(it,jt))[:], normed=True)
+            ax[i, j].set_xlim([0,1])
+            ax[i, j].plot([g[i, j]]*2, [0, ax[i,j].get_ylim()[-1]], color='red')
+    plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=.1)
+    plt.show()
+
+    print('running the node level set-up')
+    node = 0
+    cascades = main.simulate_cascades(100, g)
+    cascade, y_obs = main.build_matrix(cascades, node)
+    model = glm_node_setup(cascade, y_obs)
+    mcmc = pymc.MCMC(model)
+    mcmc.sample(1e5, 1e4)
+    plt.hist(mcmc.trace('theta_1')[:], bins=1e2)
+    plt.show()
+
diff --git a/simulation/mle.py b/simulation/mle.py
new file mode 100644
index 0000000..c6b2e85
--- /dev/null
+++ b/simulation/mle.py
@@ -0,0 +1,72 @@
+import numpy as np
+from scipy.optimize import minimize
+
+
+def likelihood(p, x, y):
+    a = np.dot(x, p)
+    return np.log(1. - np.exp(-a[y])).sum() - a[~y].sum()
+
+
+def likelihood_gradient(p, x, y):
+    a = np.dot(x, p)
+    l = np.log(1. - np.exp(-a[y])).sum() - a[~y].sum()
+    g1 = 1. / (np.exp(a[y]) - 1.)
+    g = (x[y] * g1[:, np.newaxis]).sum(0) - x[~y].sum(0)
+    return l, g
+
+
+def test_gradient(x, y):
+    eps = 1e-10
+    for i in xrange(x.shape[1]):
+        p = 0.5 * np.ones(x.shape[1])
+        a = np.dot(x, p)
+        g1 = 1. / (np.exp(a[y]) - 1.)
+        g = (x[y] * g1[:, np.newaxis]).sum(0) - x[~y].sum(0)
+        p[i] += eps
+        f1 = likelihood(p, x, y)
+        p[i] -= 2 * eps
+        f2 = likelihood(p, x, y)
+        print(g[i], (f1 - f2) / (2 * eps))
+
+
+def infer(x, y):
+    def f(p):
+        l, g = likelihood_gradient(p, x, y)
+        return -l, -g
+    x0 = np.ones(x.shape[1])
+    bounds = [(1e-10, None)] * x.shape[1]
+    return minimize(f, x0, jac=True, bounds=bounds, method="L-BFGS-B").x
+
+
+def bootstrap(x, y, n_iter=100):
+    rval = np.zeros((n_iter, x.shape[1]))
+    for i in xrange(n_iter):
+        indices = np.random.choice(len(y), replace=False, size=int(len(y)*.9))
+        rval[i] = infer(x[indices], y[indices])
+    return rval
+
+
+def confidence_interval(counts, bins):
+    k = 0
+    while np.sum(counts[len(counts)/2-k:len(counts)/2+k]) <= .95*np.sum(counts):
+        k += 1
+    return bins[len(bins)/2-k], bins[len(bins)/2+k]
+
+
+def build_matrix(cascades, node):
+
+    def aux(cascade, node):
+        xlist, slist = zip(*cascade)
+        indices = [i for i, s in enumerate(slist) if s[node] and i >= 1]
+        if indices:
+            x = np.vstack(xlist[i-1] for i in indices)
+            y = np.array([xlist[i][node] for i in indices])
+            return x, y
+        else:
+            return None
+
+    pairs = (aux(cascade, node) for cascade in cascades)
+    xs, ys = zip(*(pair for pair in pairs if pair))
+    x = np.vstack(xs)
+    y = np.concatenate(ys)
+    return x, y
diff --git a/simulation/mleNode.py b/simulation/mleNode.py
deleted file mode 100644
index c6b2e85..0000000
--- a/simulation/mleNode.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import numpy as np
-from scipy.optimize import minimize
-
-
-def likelihood(p, x, y):
-    a = np.dot(x, p)
-    return np.log(1. - np.exp(-a[y])).sum() - a[~y].sum()
-
-
-def likelihood_gradient(p, x, y):
-    a = np.dot(x, p)
-    l = np.log(1. - np.exp(-a[y])).sum() - a[~y].sum()
-    g1 = 1. / (np.exp(a[y]) - 1.)
-    g = (x[y] * g1[:, np.newaxis]).sum(0) - x[~y].sum(0)
-    return l, g
-
-
-def test_gradient(x, y):
-    eps = 1e-10
-    for i in xrange(x.shape[1]):
-        p = 0.5 * np.ones(x.shape[1])
-        a = np.dot(x, p)
-        g1 = 1. / (np.exp(a[y]) - 1.)
-        g = (x[y] * g1[:, np.newaxis]).sum(0) - x[~y].sum(0)
-        p[i] += eps
-        f1 = likelihood(p, x, y)
-        p[i] -= 2 * eps
-        f2 = likelihood(p, x, y)
-        print(g[i], (f1 - f2) / (2 * eps))
-
-
-def infer(x, y):
-    def f(p):
-        l, g = likelihood_gradient(p, x, y)
-        return -l, -g
-    x0 = np.ones(x.shape[1])
-    bounds = [(1e-10, None)] * x.shape[1]
-    return minimize(f, x0, jac=True, bounds=bounds, method="L-BFGS-B").x
-
-
-def bootstrap(x, y, n_iter=100):
-    rval = np.zeros((n_iter, x.shape[1]))
-    for i in xrange(n_iter):
-        indices = np.random.choice(len(y), replace=False, size=int(len(y)*.9))
-        rval[i] = infer(x[indices], y[indices])
-    return rval
-
-
-def confidence_interval(counts, bins):
-    k = 0
-    while np.sum(counts[len(counts)/2-k:len(counts)/2+k]) <= .95*np.sum(counts):
-        k += 1
-    return bins[len(bins)/2-k], bins[len(bins)/2+k]
-
-
-def build_matrix(cascades, node):
-
-    def aux(cascade, node):
-        xlist, slist = zip(*cascade)
-        indices = [i for i, s in enumerate(slist) if s[node] and i >= 1]
-        if indices:
-            x = np.vstack(xlist[i-1] for i in indices)
-            y = np.array([xlist[i][node] for i in indices])
-            return x, y
-        else:
-            return None
-
-    pairs = (aux(cascade, node) for cascade in cascades)
-    xs, ys = zip(*(pair for pair in pairs if pair))
-    x = np.vstack(xs)
-    y = np.concatenate(ys)
-    return x, y
diff --git a/simulation/utils.py b/simulation/utils.py
new file mode 100644
index 0000000..aad7771
--- /dev/null
+++ b/simulation/utils.py
@@ -0,0 +1,64 @@
+import numpy as np
+import numpy.random as nr
+from six.moves import range
+
+
+def create_random_graph(n_nodes, p=.5):
+    graph = .5 * np.random.binomial(2, p=.5, size=(n_nodes, n_nodes))
+    for k in range(len(graph)):
+        graph[k, k] = 0
+    return np.log(1. / (1 - p * graph))
+
+
+def create_wheel(n_nodes, p=.5):
+    graph = np.zeros((n_nodes, n_nodes))
+    graph[0] = np.ones(n_nodes)
+    graph[0, 0] = 0
+    for i in range(1, n_nodes-1):
+        graph[i, i + 1] = 1
+    graph[n_nodes-1, 1] = 1
+    return np.log(1. / (1 - p * graph))
+
+
+def simulate_cascade(x, graph):
+    """
+    Simulate an IC cascade given a graph and initial state.
+
+    For each time step we yield:
+        - susc: the nodes susceptible at the beginning of this time step
+        - x: the subset of susc who became infected
+    """
+    yield x, np.zeros(graph.shape[0], dtype=bool)
+    susc = np.ones(graph.shape[0], dtype=bool)
+    while np.any(x):
+        susc = susc ^ x  # nodes infected at previous step are now inactive
+        if not np.any(susc):
+            break
+        x = 1 - np.exp(-np.dot(graph.T, x))
+        y = nr.random(x.shape[0])
+        x = (x >= y) & susc
+        yield x, susc
+
+
+def random_source(graph, node_p=None):
+    n_nodes = graph.shape[0]
+    if node_p is None:
+        node_p = np.ones(n_nodes) / n_nodes
+    x0 = np.zeros(graph.shape[0], dtype=bool)
+    x0[nr.choice(n_nodes, p=node_p)] = True
+    return x0
+
+
+def simulate_cascades(n_obs, graph, source=random_source):
+    n_nodes = graph.shape[0]
+    x_obs = np.zeros((n_obs, n_nodes), dtype=bool)
+    s_obs = np.zeros((n_obs, n_nodes), dtype=bool)
+    i = 0
+    while i < n_obs:
+        for x, s in simulate_cascade(source(graph), graph):
+            x_obs[i] = x
+            s_obs[i] = s
+            i += 1
+            if i >= n_obs:
+                break
+    return x_obs, s_obs
-- 
cgit v1.2.3-70-g09d2