"""Degree, closeness-centrality, PageRank, and voter-model analyses for a
set of social-network edge-list datasets, with matplotlib plots of each
resulting distribution.

Each dataset ``<name>.txt`` must contain one ``u v`` edge per line
(whitespace separated integers).  Intermediate per-node results are cached
in ``<name>_<metric>.txt`` files so the plotting steps can be re-run
without recomputing the (expensive) graph metrics.

Ported from Python 2 to Python 3 (``print()``, ``dict.items()`` /
``.values()``, ``range``, explicit ``list()`` for ``random.sample``).
"""

import random

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np

DATASETS = ["enron", "gpg", "livejournal", "epinions"]
#DATASETS = ["enron", "gpg", "epinions"]


def build_graph(dataset):
    """Read ``<dataset>.txt`` into an adjacency dict {node: [neighbours]}.

    The "gpg" dataset is undirected, so each of its edges is inserted in
    both directions; every other dataset is treated as directed.
    """
    adjacency = {}
    with open(dataset + ".txt") as f:
        for line in f:
            u, v = map(int, line.split())
            adjacency.setdefault(u, []).append(v)
            if dataset == "gpg":
                adjacency.setdefault(v, []).append(u)
    return adjacency


def degree_distributions(dataset):
    """Write each node's (out-)degree, one per line, to ``<dataset>_distr.txt``."""
    graph = build_graph(dataset)
    with open(dataset + "_distr.txt", "w") as f:
        for u in graph:
            f.write(str(len(graph[u])) + "\n")


def plot_degrees(log=False):
    """Plot the degree distribution of every dataset in a 2x2 grid.

    Reads the cached ``<dataset>_distr.txt`` files; set ``log=True`` for
    log-log axes.  Saves the figure to ``degr.pdf``.
    """
    plt.figure(figsize=(8, 6))
    for i, dataset in enumerate(DATASETS):
        with open(dataset + "_distr.txt") as f:
            values = [int(line) for line in f]
        counts = {}
        for value in values:
            counts[value] = counts.get(value, 0) + 1
        ax = plt.subplot(2, 2, i + 1)
        if log:
            ax.set_yscale("log")
            ax.set_xscale("log")
        degrees, freqs = zip(*counts.items())
        # BUG FIX: the original normalised by the last enumerate index
        # (len(values) - 1); divide by the actual number of nodes.
        plt.plot(degrees, np.array(freqs) / float(len(values)), ".")
        plt.xlabel("Degree")
        plt.ylabel("Fraction of nodes")
        plt.title(dataset)
    plt.tight_layout()
    plt.savefig("degr.pdf")


def centrality(dataset):
    """Estimate harmonic closeness centrality on a 1000-node random sample.

    For each sampled node u the score is sum(1/d(u, v)) over all reachable
    v != u.  Scores are written, one per line, to
    ``<dataset>_centrality.txt``.
    """
    random.seed()
    graph = build_graph(dataset)
    G = nx.from_dict_of_lists(graph)
    # random.sample requires a sequence in Python 3, not a dict view.
    sample = random.sample(list(graph), 1000)
    scores = []
    for i, u in enumerate(sample):
        print(i)  # progress indicator; BFS per node is slow on big graphs
        lengths = nx.single_source_shortest_path_length(G, u)
        # Drop the zero distance to u itself before inverting.
        dists = np.array([d for d in lengths.values() if d != 0])
        scores.append(np.sum(1.0 / dists))
    with open(dataset + "_centrality.txt", "w") as f:
        for c in scores:
            f.write(str(c) + "\n")


def plot_centrality(log=False):
    """Plot a 30-bin histogram of cached centrality scores per dataset.

    Reads ``<dataset>_centrality.txt``; saves to ``centrality.pdf``.
    (``log`` is accepted for interface symmetry but currently unused,
    matching the original.)
    """
    plt.figure(figsize=(8, 6))
    for i, dataset in enumerate(DATASETS):
        with open(dataset + "_centrality.txt") as f:
            values = [float(line) for line in f]
        plt.subplot(2, 2, i + 1)
        n = len(values)
        h, b = np.histogram(values, bins=30)
        width = 0.7 * (b[1] - b[0])
        plt.bar(b[:-1], h / float(n), width=width)
        plt.xlabel("Centrality")
        plt.ylabel("Fraction of nodes")
        plt.title(dataset)
    plt.tight_layout()
    plt.savefig("centrality.pdf")


def pagerank(G, alpha=0.85, max_iter=100):
    """Power-iteration PageRank with uniform teleport distribution.

    Runs a fixed ``max_iter`` iterations (no convergence test, matching
    the original).  Dangling-node mass is redistributed uniformly.

    Returns a dict mapping node -> PageRank score.
    """
    D = G if G.is_directed() else G.to_directed()
    W = nx.stochastic_graph(D)  # right-stochastic: edge weight = 1/out_degree
    N = W.number_of_nodes()
    x = dict.fromkeys(W, 1.0 / N)   # current score vector
    p = dict.fromkeys(W, 1.0 / N)   # uniform personalisation vector
    dangling_weights = p            # dangling mass also spread uniformly
    dangling_nodes = [n for n in W if W.out_degree(n) == 0.0]
    for i in range(max_iter):
        print(i)  # progress indicator
        xlast = x
        x = dict.fromkeys(xlast, 0)
        danglesum = alpha * sum(xlast[n] for n in dangling_nodes)
        for n in x:
            for nbr in W[n]:
                x[nbr] += alpha * xlast[n] * W[n][nbr]["weight"]
            x[n] += danglesum * dangling_weights[n] + (1.0 - alpha) * p[n]
    return x


def voter(G, max_iter=100):
    """Iterate the (deterministic mean-field) voter update ``max_iter`` times.

    Starting from all-ones, each step pushes every node's mass along its
    out-edges weighted by 1/out_degree.  Returns a dict node -> influence.
    """
    D = G if G.is_directed() else G.to_directed()
    W = nx.stochastic_graph(D)
    x = dict.fromkeys(W, 1.0)
    for i in range(max_iter):
        print(i)  # progress indicator
        xlast = x
        x = dict.fromkeys(xlast, 0)
        for n in x:
            for nbr in W[n]:
                x[nbr] += xlast[n] * W[n][nbr]["weight"]
    return x


def page_rank(dataset, n=100):
    """Compute PageRank for ``dataset`` and cache scores in ``<dataset>_pr.txt``."""
    graph = build_graph(dataset)
    G = nx.from_dict_of_lists(graph)
    x = pagerank(G, max_iter=n)
    with open(dataset + "_pr.txt", "w") as f:
        for v in x.values():
            f.write(str(v) + "\n")


def voter_model(dataset, n=100):
    """Run the voter model for ``dataset`` and cache scores in ``<dataset>_voter.txt``."""
    graph = build_graph(dataset)
    G = nx.from_dict_of_lists(graph)
    x = voter(G, max_iter=n)
    with open(dataset + "_voter.txt", "w") as f:
        for v in x.values():
            f.write(str(v) + "\n")


def _plot_threshold_distribution(suffix, xlabel, outfile):
    """Shared body of plot_pr / plot_voter.

    For each dataset, read per-node scores from ``<dataset><suffix>``,
    lay 500 evenly spaced thresholds across [min, max], count how many
    thresholds each value exceeds (bucketed by the threshold just below),
    and plot the normalised counts on log-log axes.  Saves ``outfile``.
    """
    plt.figure(figsize=(8, 6))
    for i, dataset in enumerate(DATASETS):
        with open(dataset + suffix) as f:
            values = [float(line) for line in f]
        n = len(values)
        low, high = min(values), max(values)
        bins = 500
        thresholds = [low + k * (high - low) / float(bins) for k in range(bins)]
        counts = {}
        for value in values:
            for k, t in enumerate(thresholds):
                if value > t:
                    # BUG FIX: the original used thresholds[k - 1], which
                    # for k == 0 wrapped around to the LAST threshold and
                    # polluted the top bucket; clamp at the first instead.
                    u = thresholds[max(k - 1, 0)]
                    counts[u] = counts.get(u, 0) + 1
        ax = plt.subplot(2, 2, i + 1)
        ax.set_yscale("log")
        ax.set_xscale("log")
        a, b = zip(*counts.items())
        # BUG FIX: normalise by the node count, not the last loop index.
        plt.plot(a, np.array(b) / float(n), ".")
        plt.xlabel(xlabel)
        plt.ylabel("Fraction of nodes")
        plt.title(dataset)
    plt.tight_layout()
    plt.savefig(outfile)


def plot_pr():
    """Plot the PageRank score distribution (log-log) for every dataset."""
    _plot_threshold_distribution("_pr.txt", "PageRank", "pr.pdf")


def plot_voter():
    """Plot the voter-model influence distribution (log-log) for every dataset."""
    _plot_threshold_distribution("_voter.txt", "Voter Influence", "voter.pdf")


if __name__ == "__main__":
    #degree_distributions("gpg")
    #plot_degrees(log=False)
    #for dataset in DATASETS:
    #    voter_model(dataset)
    #plot_centrality()
    #plot_pr()
    #page_rank("epinions")
    plot_voter()