diff options
Diffstat (limited to 'facebook_analysis')
| -rw-r--r-- | facebook_analysis/Makefile | 4 | ||||
| -rw-r--r-- | facebook_analysis/ads.pyx | 8 | ||||
| -rw-r--r-- | facebook_analysis/analyze.py | 55 | ||||
| -rw-r--r-- | facebook_analysis/seed.py | 125 |
4 files changed, 154 insertions, 38 deletions
diff --git a/facebook_analysis/Makefile b/facebook_analysis/Makefile index e3df848..0b011a2 100644 --- a/facebook_analysis/Makefile +++ b/facebook_analysis/Makefile @@ -1,3 +1,7 @@ all: python2 setup.py build_ext --inplace cython2 -a ads.pyx + +clean: + python2 setup.py clean + rm ads.c ads.html ads.so diff --git a/facebook_analysis/ads.pyx b/facebook_analysis/ads.pyx index 2682456..40d6391 100644 --- a/facebook_analysis/ads.pyx +++ b/facebook_analysis/ads.pyx @@ -91,14 +91,16 @@ cdef float merge_opt_p_sample(list l1, list l2, int t, dict degrees, float p): i = j = 0 n = s = 0. cdef int k + cdef int l cdef dict a cdef float r k = 0 r = 0 + l = 0 a = {} - for k in xrange(len(degrees)**2): - for d in degrees: - a[d] = random.random() + for k in xrange(t**2): + for l in xrange(t): + random.random() while n < t: if i == n1 and j == n2: break diff --git a/facebook_analysis/analyze.py b/facebook_analysis/analyze.py index c5e6feb..9b7f893 100644 --- a/facebook_analysis/analyze.py +++ b/facebook_analysis/analyze.py @@ -10,10 +10,14 @@ import pulp import sys from random import seed, betavariate, normalvariate import matplotlib.pyplot as plt +from scipy.sparse import coo_matrix +from sklearn.preprocessing import normalize as nm DATA_DIR = "../facebook_data" -DATASETS = ["hbo", "nyt", "lp", "google", "lmpt", "gp", "kiva", "coachella", - "peet", "gap"] +#DATASETS = ["hbo", "nyt", "lp", "google", "lmpt", "gp", "kiva", "coachella", +# "peet", "gap"] +DATASETS = ["hbo", "nyt", "lp", "google", "gp", "kiva", "coachella", + "gap"] SYNTH_DIR = "../apgl" SYNTH_DATASETS = ["b-a", "kk", "sw"] @@ -74,6 +78,48 @@ def build_graph(dataset): return build_graph1(dataset) +def build_graph3(dataset): + d = {} + e = {} + with open(dataset + ".txt") as f: + for line in f: + u, v = map(int, line.strip().split()) + d[u, v] = 1 + d[v, u] = 1 + d[u, u] = 1 + if u in e: + e[u].append(v) + else: + e[u] = [v] + i, j = zip(*d.keys()) + v = d.values() + m = coo_matrix((v, (i, j)), dtype="float") + m = nm(m, norm='l1', axis=1, copy=False) + return m, e + + +def voter(mat, node, t): + n = mat.shape[0] + v = np.zeros(n) + u = np.ones(n) + v[node] = 1 + for i in xrange(t): + v = mat.dot(v) + return v.dot(u) + + +def influence_exp(dataset, size): + mat, graph = build_graph3(dataset) + sp = sample(graph.keys(), size) + graph = {s: graph[s] for s in sp} + sd = list(sd_users(graph)) + sd += graph.keys() + for t in xrange(100): + degrees = {s: voter(mat, s, t) for s in sd} + #aps(graph, degrees, size) + print im(graph, degrees, size) + + def print_graph(dataset): graph, degrees = build_graph(dataset) with open(dataset + "_single_graph.txt", "w") as f: @@ -257,7 +303,7 @@ def lp_time(): def aps_time(): - graph, degrees = build_graph("big") + graph, degrees = build_graph("hbo") sp = sample(graph.keys(), int(sys.argv[2])) graph = {s: graph[s] for s in sp} a = int(sys.argv[1]) @@ -326,7 +372,7 @@ def stats(): if __name__ == "__main__": #for dataset in SYNTH_DATASETS: # compute_performance(dataset) - compute_performance_p("coachella", "deg") + #compute_performance_p("coachella", "power") #compute_performance("coachella") #hbo_likes() #lp_perf() @@ -338,3 +384,4 @@ if __name__ == "__main__": # with open("coachella_degrees.txt", "w") as fh: # for deg in degrees.itervalues(): # fh.write(str(deg) + "\n") + influence_exp("slashdot", 100) diff --git a/facebook_analysis/seed.py b/facebook_analysis/seed.py index 7e2b851..cba45e1 100644 --- a/facebook_analysis/seed.py +++ b/facebook_analysis/seed.py @@ -1,23 +1,80 @@ from analyze import sd_users, build_graph, DATASETS, SYNTH_DATASETS import matplotlib.pyplot as plt from matplotlib import rcParams, cm -from matplotlib.colors import Normalize from matplotlib.pyplot import plot, legend, savefig, xlabel, ylabel,\ hist, title, subplot, tight_layout, ticklabel_format, xlim, ylim from mpl_toolkits.mplot3d import Axes3D import numpy as np -import itertools mq = lambda x: x * 4 +def voter(): + with open("epinions_voter.txt") as f: + ep = [float(line) for line in f] + with open("epinions_voter_influence.txt") as f: + epr = [float(line) for line in f] + with open("slashdot_voter.txt") as f: + sl = [float(line) for line in f] + with open("slashdot_voter_influence.txt") as f: + slr = [float(line) for line in f] + a = range(1, 51) + plt.figure(figsize=(7, 3)) + plt.subplot(1, 2, 1) + plt.plot(a, ep, label="Adapt. Seeding") + plt.plot(a, epr, label="Inf. Max.") + plt.legend() + plt.title("Epinions") + plt.xlabel("t") + plt.ylabel("Performance") + plt.subplot(1, 2, 2) + plt.plot(a, sl, label="Adapt. Seeding") + plt.plot(a, slr, label="Inf. Max") + plt.legend() + plt.title("Slashdot") + plt.xlabel("t") + plt.ylabel("Performance") + plt.savefig("voter.pdf") + + +def sampling(): + with open("hbo_sampling.txt") as f: + values = [line.strip().replace(",", "").split() for line in f] + ks, ts, cs = zip(*values) + ks = map(int, ks) + ts = map(float, ts) + cs = map(float, cs) + with open("hbo_sans_sampling.txt") as f: + values = [line.strip().replace(",", "").split() for line in f] + k, t, c = zip(*values) + k = map(int, k) + t = map(float, t) + c = map(float, c) + plt.figure(figsize=(7, 3)) + plt.subplot(1, 2, 1) + plt.gca().set_yscale("log") + plt.plot(ks, ts, label="Sampling based") + plt.plot(ks, t, label="Comb. alg.") + plt.xlabel("Size") + plt.ylabel("Time (s)") + plt.legend(loc="upper left") + plt.subplot(1, 2, 2) + plt.gca().set_yscale("log") + plt.plot(ks, cs, label="Sampling based") + plt.plot(ks, c, label="Comb. alg.") + plt.legend(loc="upper left") + plt.xlabel("Size") + plt.ylabel("\# Cycles") + plt.savefig("sampling2.pdf") + + def plot_degree_distributions(): plt.figure(figsize=(7, 3)) graph, degrees = build_graph("kiva") fd_degrees = list(degrees[user] for user in graph) sd_degrees = list(degrees[user] for user in sd_users(graph)) n, bins, patches = plt.hist(fd_degrees, bins=50, cumulative=True, - label="Initial users", normed=True, + label="Core set", normed=True, alpha=0.5, histtype="stepfilled") n, bins, patches = plt.hist(sd_degrees, bins=50, cumulative=True, histtype="stepfilled", normed=True, alpha=0.5, @@ -30,25 +87,27 @@ def plot_degree_distributions(): def plot_all_performances(): - plt.figure(figsize=(7, 14)) + plt.figure(figsize=(6, 6)) for i, dataset in enumerate(DATASETS): values = [map(float, line.strip().split("\t")) for line in open(dataset + "_performance.txt")] a, im, rd, rdf, aps = zip(*values) a, im, rd, rdf, aps = [map(mq, l) for l in (a, im, rd, rdf, aps)] a = np.arange(0, 1.001, 0.1) - ax = plt.subplot(5, 2, i + 1) + ax = plt.subplot(2, 2, i + 1) #ax.set_yscale("log") - plt.plot(a, im, label="Max deg.") - plt.plot(a, rd, label="Rand.") - plt.plot(a, rdf, label="Rand. friend") + plt.ticklabel_format(style='sci', axis='y', scilimits=(0, 0)) + plt.plot(a, im, label="Inf. Max") + plt.plot(a, rd, label="Rand. Node") + plt.plot(a, rdf, label="Rand. Friend") plt.plot(a, aps, label="Adapt. Seeding") - plt.xlabel("Budget (fraction of the total number of users)") + plt.xlabel("Budget") plt.ylabel("Performance") + titl = dataset if dataset == "sw": - titl = "SmallWord" - if dataset == "coachella": - titl = "Conf. Model" + titl = "SmallWorld" + #if dataset == "coachella": + # titl = "Conf. Model" if dataset == "kk": titl = "Kronecker" if dataset == "b-a": @@ -58,7 +117,7 @@ def plot_all_performances(): plt.legend(loc="upper center", ncol=4, bbox_to_anchor=(0, 0, 1, 1.03), bbox_transform=plt.gcf().transFigure) plt.tight_layout() - plt.savefig("test2.pdf") + plt.savefig("perf10.pdf") def compare_performance(fn): @@ -81,7 +140,7 @@ def compare_performance(fn): def compare_performance2(fn): plots = {} - plt.figure() + plt.figure(figsize=(5, 3)) for dataset in DATASETS: values = [map(float, line.strip().split("\t")) for line in open(dataset + "_performance.txt")] @@ -97,7 +156,7 @@ def compare_performance2(fn): width = 0.35 plt.bar(ind, means, width, linewidth=0.1) plt.errorbar([i + width / 2. for i in ind], means, [mini, maxi], elinewidth=1.2, fmt="none") - plt.xticks([i + width / 2. for i in ind], a[1:]) + plt.xticks([i + width / 2. for i in ind], [100, 150, 200, 250, 300, 350, 400, 450, 500, 550]) plt.xlim(-width, len(ind) - 1 + 2 * width) plt.xlabel("Budget") plt.ylabel("Relative improvement") @@ -116,7 +175,7 @@ def compare_dist(): sd.append(np.mean(sd_degrees)) ind = range(len(DATASETS)) width = 0.35 - plt.bar(ind, fd, width, label="Initial users", color=next(cm)) + plt.bar(ind, fd, width, label="Core users", color=next(cm)) plt.bar([i + width for i in ind], sd, width, label="Friends", color=next(cm)) plt.xlim(-width, len(ind) - 1 + 3 * width) @@ -128,6 +187,7 @@ def compare_dist(): def plot_perf_prob(): plt.figure() + rcParams["font.size"] = 10 with open("peet_performance_p.txt") as f: values = [map(float, line.strip().split("\t")) for line in f] values = zip(*values) @@ -138,36 +198,38 @@ def plot_perf_prob(): with open("peet_performance.txt") as f: values = [map(float, line.strip().split("\t")) for line in f] values = zip(*values) - plt.gca().set_yscale("log") + #plt.gca().set_yscale("log") + plt.ticklabel_format(style='sci', axis='y', scilimits=(0, 0)) plt.xlabel("Budget") plt.ylabel("Performance") - plt.plot(values[0], values[1], label="Max. degree") - plt.legend(loc="lower right", fontsize="small", ncol=2) + plt.plot(values[0], values[1], label="Inf. Max.") + plt.legend(loc="upper left", fontsize="small", ncol=2) xlim(xmax=450) plt.savefig("prob.pdf") def plot_hbo_likes(): plt.figure() - rcParams["font.size"] = 6 + rcParams["font.size"] = 10 with open("hbo_likes_performance.txt") as f: values = [map(float, line.strip().split("\t")) for line in f] a, im, aps, apso = zip(*values) a = np.arange(0, 1.001, 0.1) - plt.gca().set_yscale("log") - #plt.ticklabel_format(style='sci', axis='y', scilimits=(0, 0)) - plt.plot(a, map(mq, im), label="Max. degr.") - plt.plot(a, map(mq, aps), label="Adapt. seed. (rest.)") + #plt.gca().set_yscale("log") + plt.ticklabel_format(style='sci', axis='y', scilimits=(0, 0)) + plt.plot(a, map(mq, im), label="Inf. Max.") + plt.plot(a, map(mq, aps), label="Adapt. seed. (subgraph)") plt.plot(a, map(mq, apso), label="Adapt. seed.") plt.xlabel("Budget") plt.ylabel("Performance") xlim(xmax=1.1) - plt.legend(loc="lower right") + plt.legend(loc="upper left") plt.savefig("hbo_likes.pdf") def plot_3d(): - for dist in ["beta", "gauss"]: + rcParams["font.size"] = 7 + for dist in ["beta", "gauss", "power", "deg"]: fig = plt.figure() with open("coachella_performance_p_" + dist + ".txt") as f: values = [map(float, line.strip().split("\t")) for line in f] @@ -180,7 +242,7 @@ def plot_3d(): ax.plot_wireframe(x, y, perfs, linewidth=0.1) ticklabel_format(style='sci', axis='z', scilimits=(0, 0)) xlabel("Budget (fraction of nodes)") - ylabel("Distribution mean") + ylabel("Mean") ax.set_zlabel("Performance") ax.invert_xaxis() plt.savefig(dist + ".pdf") @@ -232,8 +294,8 @@ def plot_time(): if __name__ == "__main__": SYNTH_DATASETS = ["b-a", "kk", "sw", "coachella"] - DATASETS = SYNTH_DATASETS - plot_all_performances() + DATASETS = ["lp", "gp", "google", "coachella"] + #plot_all_performances() #plot_3d() #plot_hbo_likes() #compare_performance() @@ -243,5 +305,6 @@ if __name__ == "__main__": #plot_degree_distributions() #for style in plt.style.available: # plt.style.use(style) - # compare_performance("performance_" + style + ".pdf") - #compare_performance2("comp4_" + ".pdf") + # compare_performance2("comp4_" + style + ".pdf") + sampling() + #voter() |
