From f1762904c648b2089031ba6ce46ccaaac4f3514c Mon Sep 17 00:00:00 2001 From: Thibaut Horel Date: Mon, 30 Nov 2015 19:57:58 -0500 Subject: Big code cleanup --- simulation/active_blocks.py | 50 ++++++++----------- simulation/bayes.py | 117 -------------------------------------------- simulation/main.py | 101 -------------------------------------- simulation/mcmc.py | 117 ++++++++++++++++++++++++++++++++++++++++++++ simulation/mle.py | 72 +++++++++++++++++++++++++++ simulation/mleNode.py | 72 --------------------------- simulation/utils.py | 64 ++++++++++++++++++++++++ 7 files changed, 273 insertions(+), 320 deletions(-) delete mode 100644 simulation/bayes.py delete mode 100644 simulation/main.py create mode 100644 simulation/mcmc.py create mode 100644 simulation/mle.py delete mode 100644 simulation/mleNode.py create mode 100644 simulation/utils.py (limited to 'simulation') diff --git a/simulation/active_blocks.py b/simulation/active_blocks.py index 569cb6c..a1f6e76 100644 --- a/simulation/active_blocks.py +++ b/simulation/active_blocks.py @@ -1,11 +1,12 @@ -import main as mn +import utils import theano from theano import tensor as tsr import blocks -import blocks.algorithms, blocks.main_loop, blocks.extensions.monitoring +from blocks import algorithms, main_loop +import blocks.extensions as be +import blocks.extensions.monitoring as bm import picklable_itertools import numpy as np -from six.moves import range import fuel import fuel.datasets import collections @@ -19,28 +20,16 @@ class LearnedDataset(fuel.datasets.Dataset): """ provides_sources = ('x', 's') - def __init__(self, node_p, graph, source=mn.var_source, **kwargs): + def __init__(self, node_p, graph, **kwargs): super(LearnedDataset, self).__init__(**kwargs) self.node_p = node_p self.graph = graph - self.n_cascades = 1 # nbr of cascades of total size approx = request - self.source = lambda graph, t : source(graph, t, self.node_p) + self.source = lambda graph: utils.random_source(graph, self.node_p) def get_data(self, state=None, request=None): - floatX = 'int8' - x_obs = np.empty((request, len(self.graph)), dtype=floatX) - s_obs = np.empty((request, len(self.graph)), dtype=floatX) - i = 0 - while i < request: - x_tmp, s_tmp = mn.build_cascade_list( - mn.simulate_cascades(self.n_cascades, self.graph, self.source), - collapse=True - ) - x_obs[i:i + len(x_tmp)] = x_tmp[:request - i] - s_obs[i:i + len(x_tmp)] = s_tmp[:request - i] - i += len(x_tmp) - self.n_cascades += 1 # learn optimal nbr in loop - self.n_cascades = max(1, self.n_cascades - 2) + # floatX = 'int8' + x_obs, s_obs = utils.simulate_cascades(request, self.graph, self.source) + return (x_obs, s_obs) @@ -115,8 +104,9 @@ def create_fixed_data_stream(n_cascades, graph, batch_size, shuffle=True): -shuffle (bool): shuffle minibatches but not within minibatch, else sequential (non-shuffled) batches are used """ - cascades = mn.build_cascade_list(mn.simulate_cascades(n_cascades, graph), - collapse=True) + cascades = utils.build_cascade_list( + utils.simulate_cascades(n_cascades, graph), + collapse=True) x_obs, s_obs = cascades[0], cascades[1] data_set = fuel.datasets.base.IndexableDataset(collections.OrderedDict( [('x', x_obs), ('s', s_obs)] @@ -138,23 +128,23 @@ def create_learned_data_stream(graph, batch_size): if __name__ == "__main__": batch_size = 1000 - graph = mn.create_star(1000) + graph = utils.create_wheel(1000) print('GRAPH:\n', graph, '\n-------------\n') x, s, params, cost = create_mle_model(graph) rmse, g_shared = rmse_error(graph, params) - alg = blocks.algorithms.GradientDescent( - cost=-cost, parameters=[params], step_rule=blocks.algorithms.AdaDelta() + alg = algorithms.GradientDescent( + cost=-cost, parameters=[params], step_rule=blocks.algorithms.AdaDelta() ) data_stream = create_learned_data_stream(graph, batch_size) - loop = blocks.main_loop.MainLoop( + loop = main_loop.MainLoop( alg, data_stream, extensions=[ - blocks.extensions.FinishAfter(after_n_batches = 10**4), - blocks.extensions.monitoring.TrainingDataMonitoring([cost, params, - rmse, g_shared], after_batch=True), - blocks.extensions.Printing(every_n_batches = 10), + be.FinishAfter(after_n_batches=10**4), + bm.TrainingDataMonitoring([cost, params, + rmse, g_shared], after_batch=True), + be.Printing(every_n_batches=10), ActiveLearning(data_stream.dataset), ] ) diff --git a/simulation/bayes.py b/simulation/bayes.py deleted file mode 100644 index bde9e94..0000000 --- a/simulation/bayes.py +++ /dev/null @@ -1,117 +0,0 @@ -import pymc -import numpy as np - -def glm_node_setup(cascade, y_obs, prior=None, *args, **kwargs): - """ - Build an IC PyMC node-level model from: - -observed cascades: cascade - -outcome vector: y_obs - -desired PyMC prior and parameters: prior, *args - Note: we use the glm formulation: y = Bernoulli[f(x.dot(theta))] - """ - n_nodes = len(cascade[0]) - - # Container class for node's parents - theta = np.empty(n_nodes, dtype=object) - for j in xrange(n_nodes): - if prior is None: - theta[j] = pymc.Beta('theta_{}'.format(j), alpha=1, beta=1) - else: - theta[j] = prior('theta_{}'.format(j), *args, **kwargs) - - # Observed container class for cascade realization - x = np.empty(n_nodes, dtype=object) - for i, val in enumerate(cascade.T): - x[i] = pymc.Normal('x_{}'.format(i), 0, 1, value=val, observed=True) - - @pymc.deterministic - def glm_p(x=x, theta=theta): - return 1. - np.exp(-x.dot(theta)) - - @pymc.observed - def y(glm_p=glm_p, value=y_obs): - return pymc.bernoulli_like(value, glm_p) - - return pymc.Model([y, pymc.Container(theta), pymc.Container(x)]) - - -def formatLabel(s, n): - return '0'*(len(str(n)) - len(str(s))) + str(s) - - -def mc_graph_setup(infected, susceptible, prior=None, *args, **kwargs): - """ - Build an IC PyMC graph-level model from: - -infected nodes over time: list/tuple of list/tuple of np.array - -susceptible nodes over time: same format as above - Note: we use the Markov Chain formulation: X_{t+1}|X_t,theta = f(X_t.theta) - """ - - # Container class for graph parameters - n_nodes = len(infected[0][0]) - theta = np.empty((n_nodes,n_nodes), dtype=object) - if prior is None: - for i in xrange(n_nodes): - for j in xrange(n_nodes): - theta[i, j] = pymc.Beta('theta_{}{}'.format(formatLabel(i, - n_nodes-1), formatLabel(j, n_nodes-1)), - alpha=1, beta=1) - else: - theta = prior(theta=theta, *args, **kwargs) - - # Container class for cascade realization - x = {} - for i, cascade in enumerate(infected): - for j, step in enumerate(cascade): - for k, node in enumerate(step): - if j and susceptible[i][j][k]: - p = 1. - pymc.exp(-cascade[j-1].dot(theta[k])) - else: - p = .5 - x[i, j, k] = pymc.Bernoulli('x_{}{}{}'.format(i, j, k), p=p, - value=node, observed=True) - - return pymc.Model([pymc.Container(theta), pymc.Container(x)]) - - -if __name__=="__main__": - import main - import matplotlib.pyplot as plt - import seaborn - seaborn.set_style('whitegrid') - g = np.array([[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]]) - p = 0.5 - g = np.log(1. / (1 - p * g)) - - print('running the graph-level MC set-up') - n_nodes = len(g) - cascades = main.simulate_cascades(1000, g) - infected, susc = main.build_cascade_list(cascades) - model = mc_graph_setup(infected, susc) - mcmc = pymc.MCMC(model) - mcmc.sample(10**4, 1000) - fig, ax = plt.subplots(len(g), len(g)) - for i in xrange(n_nodes): - for j in xrange(n_nodes): - if n_nodes < 5: - ax[i,j].locator_params(nbins=3, axis='x') - else: - ax[i, j].get_xaxis().set_ticks([]) - ax[i, j].get_yaxis().set_ticks([]) - it, jt = formatLabel(i, n_nodes-1), formatLabel(j, n_nodes-1) - ax[i,j].hist(mcmc.trace('theta_{}{}'.format(it,jt))[:], normed=True) - ax[i, j].set_xlim([0,1]) - ax[i, j].plot([g[i, j]]*2, [0, ax[i,j].get_ylim()[-1]], color='red') - plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=.1) - plt.show() - - print('running the node level set-up') - node = 0 - cascades = main.simulate_cascades(100, g) - cascade, y_obs = main.build_matrix(cascades, node) - model = glm_node_setup(cascade, y_obs) - mcmc = pymc.MCMC(model) - mcmc.sample(1e5, 1e4) - plt.hist(mcmc.trace('theta_1')[:], bins=1e2) - plt.show() - diff --git a/simulation/main.py b/simulation/main.py deleted file mode 100644 index 81133c7..0000000 --- a/simulation/main.py +++ /dev/null @@ -1,101 +0,0 @@ -import mleNode as mn - -import numpy as np -from numpy.linalg import norm -import numpy.random as nr -from scipy.optimize import minimize -import matplotlib.pyplot as plt -import seaborn -from random import random, randint -from six.moves import range - -seaborn.set_style("white") - - -def create_random_graph(n_nodes, p=.5): - graph = .5 * np.random.binomial(2, p=.5, size=(n_nodes, n_nodes)) - for k in range(len(graph)): - graph[k, k] = 0 - return np.log(1. / (1 - p * graph)) - - -def create_star(n_nodes, p=.5): - graph = np.zeros((n_nodes, n_nodes)) - graph[0] = np.ones((n_nodes,)) - graph[0, 0] = 0 - for index, row in enumerate(graph[1:-1]): - row[index + 1] = 1 - graph[-1, 1] = 1 - return np.log(1. / (1 - p * graph)) - - -def simulate_cascade(x, graph): - """ - Simulate an IC cascade given a graph and initial state. - - For each time step we yield: - - susc: the nodes susceptible at the beginning of this time step - - x: the subset of susc who became infected - """ - yield x, np.zeros(graph.shape[0], dtype=bool) - susc = np.ones(graph.shape[0], dtype=bool) - while np.any(x): - susc = susc ^ x # nodes infected at previous step are now inactive - if not np.any(susc): - break - x = 1 - np.exp(-np.dot(graph.T, x)) - y = nr.random(x.shape[0]) - x = (x >= y) & susc - yield x, susc - - -def uniform_source(graph, *args, **kwargs): - x0 = np.zeros(graph.shape[0], dtype=bool) - x0[nr.randint(0, graph.shape[0])] = True - return x0 - - -def var_source(graph, t, node_p=None, *args, **kwargs): - if node_p is None: - node_p = np.ones(len(graph)) / len(graph) - x0 = np.zeros(graph.shape[0], dtype=bool) - x0[nr.choice(a=len(graph), p=node_p)] = True - return x0 - - -def simulate_cascades(n, graph, source=uniform_source): - for t in range(n): - x0 = source(graph, t) - yield simulate_cascade(x0, graph) - - -def build_cascade_list(cascades, collapse=False): - x, s = [], [] - for cascade in cascades: - xlist, slist = zip(*cascade) - x.append(np.vstack(xlist)) - s.append(np.vstack(slist)) - if not collapse: - return x, s - else: - return np.vstack(x), np.vstack(s) - - -if __name__ == "__main__": - #g = np.array([[0, 0, 1], [0, 0, 0.5], [0, 0, 0]]) - #p = 0.5 - #g = np.log(1. / (1 - p * g)) - g = create_random_graph(n_nodes=3) - print(g) - sizes = [10**3] - for si in sizes: - cascades = simulate_cascades(si, g) - cascade, y_obs = mn.build_matrix(cascades, 2) - print(mn.infer(cascade, y_obs)) - #conf = mn.bootstrap(cascade, y_obs, n_iter=100) - #estimand = np.linalg.norm(np.delete(conf - g[0], 0, axis=1), axis=1) - #plt.hist(estimand, bins=40) - #plt.show() - #error.append(mn.confidence_interval(*np.histogram(estimand, bins=50))) - #plt.plot(sizes, error) - #plt.show() diff --git a/simulation/mcmc.py b/simulation/mcmc.py new file mode 100644 index 0000000..bde9e94 --- /dev/null +++ b/simulation/mcmc.py @@ -0,0 +1,117 @@ +import pymc +import numpy as np + +def glm_node_setup(cascade, y_obs, prior=None, *args, **kwargs): + """ + Build an IC PyMC node-level model from: + -observed cascades: cascade + -outcome vector: y_obs + -desired PyMC prior and parameters: prior, *args + Note: we use the glm formulation: y = Bernoulli[f(x.dot(theta))] + """ + n_nodes = len(cascade[0]) + + # Container class for node's parents + theta = np.empty(n_nodes, dtype=object) + for j in xrange(n_nodes): + if prior is None: + theta[j] = pymc.Beta('theta_{}'.format(j), alpha=1, beta=1) + else: + theta[j] = prior('theta_{}'.format(j), *args, **kwargs) + + # Observed container class for cascade realization + x = np.empty(n_nodes, dtype=object) + for i, val in enumerate(cascade.T): + x[i] = pymc.Normal('x_{}'.format(i), 0, 1, value=val, observed=True) + + @pymc.deterministic + def glm_p(x=x, theta=theta): + return 1. - np.exp(-x.dot(theta)) + + @pymc.observed + def y(glm_p=glm_p, value=y_obs): + return pymc.bernoulli_like(value, glm_p) + + return pymc.Model([y, pymc.Container(theta), pymc.Container(x)]) + + +def formatLabel(s, n): + return '0'*(len(str(n)) - len(str(s))) + str(s) + + +def mc_graph_setup(infected, susceptible, prior=None, *args, **kwargs): + """ + Build an IC PyMC graph-level model from: + -infected nodes over time: list/tuple of list/tuple of np.array + -susceptible nodes over time: same format as above + Note: we use the Markov Chain formulation: X_{t+1}|X_t,theta = f(X_t.theta) + """ + + # Container class for graph parameters + n_nodes = len(infected[0][0]) + theta = np.empty((n_nodes,n_nodes), dtype=object) + if prior is None: + for i in xrange(n_nodes): + for j in xrange(n_nodes): + theta[i, j] = pymc.Beta('theta_{}{}'.format(formatLabel(i, + n_nodes-1), formatLabel(j, n_nodes-1)), + alpha=1, beta=1) + else: + theta = prior(theta=theta, *args, **kwargs) + + # Container class for cascade realization + x = {} + for i, cascade in enumerate(infected): + for j, step in enumerate(cascade): + for k, node in enumerate(step): + if j and susceptible[i][j][k]: + p = 1. - pymc.exp(-cascade[j-1].dot(theta[k])) + else: + p = .5 + x[i, j, k] = pymc.Bernoulli('x_{}{}{}'.format(i, j, k), p=p, + value=node, observed=True) + + return pymc.Model([pymc.Container(theta), pymc.Container(x)]) + + +if __name__=="__main__": + import main + import matplotlib.pyplot as plt + import seaborn + seaborn.set_style('whitegrid') + g = np.array([[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]]) + p = 0.5 + g = np.log(1. / (1 - p * g)) + + print('running the graph-level MC set-up') + n_nodes = len(g) + cascades = main.simulate_cascades(1000, g) + infected, susc = main.build_cascade_list(cascades) + model = mc_graph_setup(infected, susc) + mcmc = pymc.MCMC(model) + mcmc.sample(10**4, 1000) + fig, ax = plt.subplots(len(g), len(g)) + for i in xrange(n_nodes): + for j in xrange(n_nodes): + if n_nodes < 5: + ax[i,j].locator_params(nbins=3, axis='x') + else: + ax[i, j].get_xaxis().set_ticks([]) + ax[i, j].get_yaxis().set_ticks([]) + it, jt = formatLabel(i, n_nodes-1), formatLabel(j, n_nodes-1) + ax[i,j].hist(mcmc.trace('theta_{}{}'.format(it,jt))[:], normed=True) + ax[i, j].set_xlim([0,1]) + ax[i, j].plot([g[i, j]]*2, [0, ax[i,j].get_ylim()[-1]], color='red') + plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=.1) + plt.show() + + print('running the node level set-up') + node = 0 + cascades = main.simulate_cascades(100, g) + cascade, y_obs = main.build_matrix(cascades, node) + model = glm_node_setup(cascade, y_obs) + mcmc = pymc.MCMC(model) + mcmc.sample(1e5, 1e4) + plt.hist(mcmc.trace('theta_1')[:], bins=1e2) + plt.show() + diff --git a/simulation/mle.py b/simulation/mle.py new file mode 100644 index 0000000..c6b2e85 --- /dev/null +++ b/simulation/mle.py @@ -0,0 +1,72 @@ +import numpy as np +from scipy.optimize import minimize + + +def likelihood(p, x, y): + a = np.dot(x, p) + return np.log(1. - np.exp(-a[y])).sum() - a[~y].sum() + + +def likelihood_gradient(p, x, y): + a = np.dot(x, p) + l = np.log(1. - np.exp(-a[y])).sum() - a[~y].sum() + g1 = 1. / (np.exp(a[y]) - 1.) + g = (x[y] * g1[:, np.newaxis]).sum(0) - x[~y].sum(0) + return l, g + + +def test_gradient(x, y): + eps = 1e-10 + for i in xrange(x.shape[1]): + p = 0.5 * np.ones(x.shape[1]) + a = np.dot(x, p) + g1 = 1. / (np.exp(a[y]) - 1.) + g = (x[y] * g1[:, np.newaxis]).sum(0) - x[~y].sum(0) + p[i] += eps + f1 = likelihood(p, x, y) + p[i] -= 2 * eps + f2 = likelihood(p, x, y) + print(g[i], (f1 - f2) / (2 * eps)) + + +def infer(x, y): + def f(p): + l, g = likelihood_gradient(p, x, y) + return -l, -g + x0 = np.ones(x.shape[1]) + bounds = [(1e-10, None)] * x.shape[1] + return minimize(f, x0, jac=True, bounds=bounds, method="L-BFGS-B").x + + +def bootstrap(x, y, n_iter=100): + rval = np.zeros((n_iter, x.shape[1])) + for i in xrange(n_iter): + indices = np.random.choice(len(y), replace=False, size=int(len(y)*.9)) + rval[i] = infer(x[indices], y[indices]) + return rval + + +def confidence_interval(counts, bins): + k = 0 + while np.sum(counts[len(counts)/2-k:len(counts)/2+k]) <= .95*np.sum(counts): + k += 1 + return bins[len(bins)/2-k], bins[len(bins)/2+k] + + +def build_matrix(cascades, node): + + def aux(cascade, node): + xlist, slist = zip(*cascade) + indices = [i for i, s in enumerate(slist) if s[node] and i >= 1] + if indices: + x = np.vstack(xlist[i-1] for i in indices) + y = np.array([xlist[i][node] for i in indices]) + return x, y + else: + return None + + pairs = (aux(cascade, node) for cascade in cascades) + xs, ys = zip(*(pair for pair in pairs if pair)) + x = np.vstack(xs) + y = np.concatenate(ys) + return x, y diff --git a/simulation/mleNode.py b/simulation/mleNode.py deleted file mode 100644 index c6b2e85..0000000 --- a/simulation/mleNode.py +++ /dev/null @@ -1,72 +0,0 @@ -import numpy as np -from scipy.optimize import minimize - - -def likelihood(p, x, y): - a = np.dot(x, p) - return np.log(1. - np.exp(-a[y])).sum() - a[~y].sum() - - -def likelihood_gradient(p, x, y): - a = np.dot(x, p) - l = np.log(1. - np.exp(-a[y])).sum() - a[~y].sum() - g1 = 1. / (np.exp(a[y]) - 1.) - g = (x[y] * g1[:, np.newaxis]).sum(0) - x[~y].sum(0) - return l, g - - -def test_gradient(x, y): - eps = 1e-10 - for i in xrange(x.shape[1]): - p = 0.5 * np.ones(x.shape[1]) - a = np.dot(x, p) - g1 = 1. / (np.exp(a[y]) - 1.) - g = (x[y] * g1[:, np.newaxis]).sum(0) - x[~y].sum(0) - p[i] += eps - f1 = likelihood(p, x, y) - p[i] -= 2 * eps - f2 = likelihood(p, x, y) - print(g[i], (f1 - f2) / (2 * eps)) - - -def infer(x, y): - def f(p): - l, g = likelihood_gradient(p, x, y) - return -l, -g - x0 = np.ones(x.shape[1]) - bounds = [(1e-10, None)] * x.shape[1] - return minimize(f, x0, jac=True, bounds=bounds, method="L-BFGS-B").x - - -def bootstrap(x, y, n_iter=100): - rval = np.zeros((n_iter, x.shape[1])) - for i in xrange(n_iter): - indices = np.random.choice(len(y), replace=False, size=int(len(y)*.9)) - rval[i] = infer(x[indices], y[indices]) - return rval - - -def confidence_interval(counts, bins): - k = 0 - while np.sum(counts[len(counts)/2-k:len(counts)/2+k]) <= .95*np.sum(counts): - k += 1 - return bins[len(bins)/2-k], bins[len(bins)/2+k] - - -def build_matrix(cascades, node): - - def aux(cascade, node): - xlist, slist = zip(*cascade) - indices = [i for i, s in enumerate(slist) if s[node] and i >= 1] - if indices: - x = np.vstack(xlist[i-1] for i in indices) - y = np.array([xlist[i][node] for i in indices]) - return x, y - else: - return None - - pairs = (aux(cascade, node) for cascade in cascades) - xs, ys = zip(*(pair for pair in pairs if pair)) - x = np.vstack(xs) - y = np.concatenate(ys) - return x, y diff --git a/simulation/utils.py b/simulation/utils.py new file mode 100644 index 0000000..aad7771 --- /dev/null +++ b/simulation/utils.py @@ -0,0 +1,64 @@ +import numpy as np +import numpy.random as nr +from six.moves import range + + +def create_random_graph(n_nodes, p=.5): + graph = .5 * np.random.binomial(2, p=.5, size=(n_nodes, n_nodes)) + for k in range(len(graph)): + graph[k, k] = 0 + return np.log(1. / (1 - p * graph)) + + +def create_wheel(n_nodes, p=.5): + graph = np.zeros((n_nodes, n_nodes)) + graph[0] = np.ones(n_nodes) + graph[0, 0] = 0 + for i in range(1, n_nodes-1): + graph[i, i + 1] = 1 + graph[n_nodes-1, 1] = 1 + return np.log(1. / (1 - p * graph)) + + +def simulate_cascade(x, graph): + """ + Simulate an IC cascade given a graph and initial state. + + For each time step we yield: + - susc: the nodes susceptible at the beginning of this time step + - x: the subset of susc who became infected + """ + yield x, np.zeros(graph.shape[0], dtype=bool) + susc = np.ones(graph.shape[0], dtype=bool) + while np.any(x): + susc = susc ^ x # nodes infected at previous step are now inactive + if not np.any(susc): + break + x = 1 - np.exp(-np.dot(graph.T, x)) + y = nr.random(x.shape[0]) + x = (x >= y) & susc + yield x, susc + + +def random_source(graph, node_p=None): + n_nodes = graph.shape[0] + if node_p is None: + node_p = np.ones(n_nodes) / n_nodes + x0 = np.zeros(graph.shape[0], dtype=bool) + x0[nr.choice(n_nodes, p=node_p)] = True + return x0 + + +def simulate_cascades(n_obs, graph, source=random_source): + n_nodes = graph.shape[0] + x_obs = np.zeros((n_obs, n_nodes), dtype=bool) + s_obs = np.zeros((n_obs, n_nodes), dtype=bool) + i = 0 + while i < n_obs: + for x, s in simulate_cascade(source(graph), graph): + x_obs[i] = x + s_obs[i] = s + i += 1 + if i >= n_obs: + break + return x_obs, s_obs -- cgit v1.2.3-70-g09d2