diff options
| author | Thibaut Horel <thibaut.horel@gmail.com> | 2014-10-30 00:00:29 -0400 |
|---|---|---|
| committer | Thibaut Horel <thibaut.horel@gmail.com> | 2014-10-30 00:00:29 -0400 |
| commit | 59560fca8caa4d76abe2fe8597d806e0a897874a (patch) | |
| tree | c8441b1200ffee2d6f5ebaecc9ab6077f6ea7772 /ps1/code/main.py | |
| parent | ab978841c3da7cf1e6f7e21132a88a13715db773 (diff) | |
| download | cs284-59560fca8caa4d76abe2fe8597d806e0a897874a.tar.gz | |
[ps1] Add code for Problem 5
Diffstat (limited to 'ps1/code/main.py')
| -rw-r--r-- | ps1/code/main.py | 219 |
1 file changed, 219 insertions, 0 deletions
diff --git a/ps1/code/main.py b/ps1/code/main.py new file mode 100644 index 0000000..5ddb979 --- /dev/null +++ b/ps1/code/main.py @@ -0,0 +1,219 @@ +import matplotlib.pyplot as plt +import numpy as np +import networkx as nx +import random + +DATASETS = ["enron", "gpg", "livejournal", "epinions"] +#DATASETS = ["enron", "gpg", "epinions"] + + +def build_graph(dataset): + d = {} + with open(dataset + ".txt") as f: + for line in f: + u, v = map(int, line.strip().split()) + if u in d: + d[u].append(v) + else: + d[u] = [v] + if dataset == "gpg": + if v in d: + d[v].append(u) + else: + d[v] = [u] + + return d + + +def degree_distributions(dataset): + graph = build_graph(dataset) + with open(dataset + "_distr.txt", "w") as f: + for u in graph: + f.write(str(len(graph[u])) + "\n") + + +def plot_degrees(log=False): + plt.figure(figsize=(8, 6)) + for i, dataset in enumerate(DATASETS): + values = [int(line.strip()) for line in open(dataset + "_distr.txt")] + d = {} + for j, value in enumerate(values): + if value in d: + d[value] += 1 + else: + d[value] = 1 + ax = plt.subplot(2, 2, i + 1) + if log: + ax.set_yscale("log") + ax.set_xscale("log") + a, b = zip(*d.iteritems()) + plt.plot(a, np.array(b) / float(j), ".") + plt.xlabel("Degree") + plt.ylabel("Fraction of nodes") + plt.title(dataset) + plt.tight_layout() + plt.savefig("degr.pdf") + + +def centrality(dataset): + random.seed() + graph = build_graph(dataset) + G = nx.from_dict_of_lists(graph) + pop = random.sample(graph.keys(), 1000) + l = [] + for i, u in enumerate(pop): + print i + lengths = nx.single_source_shortest_path_length(G, u) + lengths = np.array([v for v in lengths.values() if v != 0]) + s = np.sum(1. 
/ lengths) + l.append(s) + with open(dataset + "_centrality.txt", "w") as f: + for c in l: + f.write(str(c) + "\n") + + +def plot_centrality(log=False): + plt.figure(figsize=(8, 6)) + for i, dataset in enumerate(DATASETS): + values = [float(line.strip()) + for line in open(dataset + "_centrality.txt")] + plt.subplot(2, 2, i + 1) + n = len(values) + h, b = np.histogram(values, bins=30) + width = 0.7 * (b[1] - b[0]) + plt.bar(b[:-1], h / float(n), width=width) + plt.xlabel("Centrality") + plt.ylabel("Fraction of nodes") + plt.title(dataset) + plt.tight_layout() + plt.savefig("centrality.pdf") + + +def pagerank(G, alpha=0.85, max_iter=100): + if not G.is_directed(): + D = G.to_directed() + else: + D = G + W = nx.stochastic_graph(D) + N = W.number_of_nodes() + x = dict.fromkeys(W, 1.0 / N) + p = dict.fromkeys(W, 1.0 / N) + + dangling_weights = p + dangling_nodes = [n for n in W if W.out_degree(n) == 0.0] + + for i in range(max_iter): + print i + xlast = x + x = dict.fromkeys(xlast.keys(), 0) + danglesum = alpha * sum(xlast[n] for n in dangling_nodes) + for n in x: + for nbr in W[n]: + x[nbr] += alpha * xlast[n] * W[n][nbr]["weight"] + x[n] += danglesum * dangling_weights[n] + (1.0 - alpha) * p[n] + return x + + +def voter(G, max_iter=100): + if not G.is_directed(): + D = G.to_directed() + else: + D = G + W = nx.stochastic_graph(D) + x = dict.fromkeys(W, 1.0) + + for i in range(max_iter): + print i + xlast = x + x = dict.fromkeys(xlast.keys(), 0) + for n in x: + for nbr in W[n]: + x[nbr] += xlast[n] * W[n][nbr]["weight"] + return x + + +def page_rank(dataset, n=100): + graph = build_graph(dataset) + G = nx.from_dict_of_lists(graph) + x = pagerank(G, max_iter=n) + with open(dataset + "_pr.txt", "w") as f: + for v in x.itervalues(): + f.write(str(v) + "\n") + + +def voter_model(dataset, n=100): + graph = build_graph(dataset) + G = nx.from_dict_of_lists(graph) + x = voter(G, max_iter=n) + with open(dataset + "_voter.txt", "w") as f: + for v in x.itervalues(): + 
f.write(str(v) + "\n") + + +def plot_pr(): + plt.figure(figsize=(8, 6)) + for i, dataset in enumerate(DATASETS): + values = [float(line.strip()) for line in open(dataset + "_pr.txt")] + low, high = min(values), max(values) + bins = 500 + thresholds = [low + l * (high - low) / bins for l in xrange(bins)] + d = {} + for j, value in enumerate(values): + for k, t in enumerate(thresholds): + if value > t: + u = thresholds[k - 1] + if u in d: + d[u] += 1 + else: + d[u] = 1 + + ax = plt.subplot(2, 2, i + 1) + ax.set_yscale("log") + ax.set_xscale("log") + a, b = zip(*d.iteritems()) + plt.plot(a, np.array(b) / float(j), ".") + plt.xlabel("PageRank") + plt.ylabel("Fraction of nodes") + plt.title(dataset) + plt.tight_layout() + plt.savefig("pr.pdf") + + +def plot_voter(): + plt.figure(figsize=(8, 6)) + for i, dataset in enumerate(DATASETS): + values = [float(line.strip()) for line in open(dataset + "_voter.txt")] + low, high = min(values), max(values) + bins = 500 + thresholds = [low + l * (high - low) / float(bins) + for l in xrange(bins)] + d = {} + for j, value in enumerate(values): + for k, t in enumerate(thresholds): + if value > t: + u = thresholds[k - 1] + if u in d: + d[u] += 1 + else: + d[u] = 1 + + ax = plt.subplot(2, 2, i + 1) + ax.set_yscale("log") + ax.set_xscale("log") + a, b = zip(*d.iteritems()) + plt.plot(a, np.array(b) / float(j), ".") + plt.xlabel("Voter Influence") + plt.ylabel("Fraction of nodes") + plt.title(dataset) + plt.tight_layout() + plt.savefig("voter.pdf") + +if __name__ == "__main__": + #degree_distributions("gpg") + #plot_degrees(log=False) + #for dataset in DATASETS: + # voter_model(dataset) + #plot_centrality() + #plot_pr() + #page_rank("epinions") + plot_voter() |
