Diffstat (limited to 'facebook_analysis/analyze.py')
-rw-r--r--  facebook_analysis/analyze.py  340
1 file changed, 340 insertions(+), 0 deletions(-)
diff --git a/facebook_analysis/analyze.py b/facebook_analysis/analyze.py
new file mode 100644
index 0000000..c5e6feb
--- /dev/null
+++ b/facebook_analysis/analyze.py
@@ -0,0 +1,340 @@
+from bs4 import BeautifulSoup
+import os.path as op
+from client.tasks import normalize
+from itertools import chain
+from random import sample
+from multiprocessing import Pool
+from ads import fs, fs_p, fs_ps, fs_p_sample
+import numpy as np
+import pulp
+import sys
+from random import seed, betavariate, normalvariate
+import matplotlib.pyplot as plt
+
+DATA_DIR = "../facebook_data"
+DATASETS = ["hbo", "nyt", "lp", "google", "lmpt", "gp", "kiva", "coachella",
+            "peet", "gap"]
+SYNTH_DIR = "../apgl"
+SYNTH_DATASETS = ["b-a", "kk", "sw"]
+
+
+def build_graph2(dataset):
+    users_file = op.join(DATA_DIR, dataset + "_users.txt")
+    seed_file = op.join(DATA_DIR, dataset + ".txt")
+    degrees = {}
+    graph = {}
+    with open(users_file) as f:
+        for line in f:
+            values = line.strip().split()
+            degrees[values[0]] = int(values[1])
+
+    soup = BeautifulSoup(open(seed_file))
+    links = [div.a["href"] for div in soup.findAll("div", class_="fsl")]
+    for link in links:
+        basename, fname, getname = normalize(link)
+        long_name = op.join(DATA_DIR, "facebook", fname)
+        if not op.isfile(long_name):
+            continue
+        else:
+            with open(long_name) as f:
+                friends = [normalize(line.strip())[1] for line in f]
+                friends = [friend for friend in friends
+                           if (friend in degrees) and degrees[friend] > 0]
+                if len(friends) > 0:
+                    friends = list(set(friends))
+                    friends.sort(key=degrees.get, reverse=True)
+                    graph[fname] = friends
+                    degrees[fname] = len(friends)
+    print dataset, len(graph), len(list(sd_users(graph)))
+    return graph, degrees
+
+
+def build_graph1(dataset):
+    fname = op.join(SYNTH_DIR, dataset + ".txt")
+    degrees = {}
+    graph = {}
+    with open(fname) as fh:
+        for line in fh:
+            values = line.strip().split("\t")
+            node = int(values[0])
+            friends = zip(*[iter(values[1:])] * 2)
+            friends = [map(int, f) for f in friends]
+            for friend in friends:
+                degrees[friend[0]] = friend[1]
+            graph[node] = [friend[0] for friend in friends]
+            degrees[node] = len(graph[node])
+    print fname, len(graph), len(list(sd_users(graph)))
+    return graph, degrees
+
+
+def build_graph(dataset):
+    if dataset in DATASETS or dataset == "big":
+        return build_graph2(dataset)
+    else:
+        return build_graph1(dataset)
+
+
+def print_graph(dataset):
+    graph, degrees = build_graph(dataset)
+    with open(dataset + "_single_graph.txt", "w") as f:
+        for user, friends in graph.iteritems():
+            friends_deg = [(friend, str(degrees[friend]))
+                           for friend in friends]
+            f.write(user + "\t" +
+                    "\t".join("\t".join(friend) for friend in friends_deg) +
+                    "\n")
+
+
+def sd_users(graph):
+    return chain.from_iterable(graph.itervalues())
+
+
+def random(graph, degrees, n):
+    #n = int(len(graph) * ratio)
+    values = []
+    for _ in xrange(100):
+        users = sample(graph, n)
+        values.append(sum(degrees[user] for user in users))
+    return sum(values) / float(len(values))
+
+
+def random_friend(graph, degrees, n):
+    #n = int(len(graph) * ratio)
+    values = []
+    for _ in xrange(100):
+        users = sample(graph, n / 2)
+        values.append(sum(degrees[sample(graph[user], 1)[0]] + degrees[user]
+                          for user in users))
+    return sum(values) / float(len(values))
+
+
+def im(graph, degrees, n):
+    #n = int(len(graph) * ratio)
+    l = list(graph.iterkeys())
+    l.sort(key=lambda x: degrees[x], reverse=True)
+    return sum(degrees[user] for user in l[:n])
+
+
+def aps(graph, degrees, k, p=1, sampl=False):
+    x = list(set(graph.keys()))
+    #k = int(len(graph) * ratio) # budget
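+    # note (comment not in the original commit): zip() truncates
+    # range(1, k - 1) against the (k - 1)-long argument lists, so each
+    # pool worker below gets one tuple (i, k, x, graph, degrees[, p])
+    # for i = 1 .. k - 2, and the best of the returned values is kept.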
+    P = Pool(5)
+    if p == 1:
+        m = P.map(fs, zip(range(1, k - 1),
+                          [k] * (k - 1),
+                          [x] * (k - 1),
+                          [graph] * (k - 1),
+                          [degrees] * (k - 1)))
+    elif type(p) is dict:
+        m = P.map(fs_ps, zip(range(1, k - 1),
+                             [k] * (k - 1),
+                             [x] * (k - 1),
+                             [graph] * (k - 1),
+                             [degrees] * (k - 1),
+                             [p] * (k - 1)))
+    elif sampl:
+        m = P.map(fs_p_sample, zip(range(1, k - 1),
+                                   [k] * (k - 1),
+                                   [x] * (k - 1),
+                                   [graph] * (k - 1),
+                                   [degrees] * (k - 1),
+                                   [p] * (k - 1)))
+    else:
+        m = P.map(fs_p, zip(range(1, k - 1),
+                            [k] * (k - 1),
+                            [x] * (k - 1),
+                            [graph] * (k - 1),
+                            [degrees] * (k - 1),
+                            [p] * (k - 1)))
+
+    P.close()
+    try:
+        m = max(m)
+    except ValueError:
+        m = 0
+    print m
+    return m
+
+
+def generate_degree(degrees, p, distr="beta"):
+    #plt.figure()
+    seed()
+    ps = {}
+    if distr == "beta":
+        beta = 5.
+        alpha = 5. * p / (1 - p)
+        sample = lambda d: betavariate(alpha, beta)
+    elif distr == "gauss":
+        sample = lambda d: normalvariate(p, 0.01)
+    elif distr == "power":
+        alpha = 1.
+        beta = (1. - p) / p
+        sample = lambda d: betavariate(alpha, beta)
+    elif distr == "deg":
+        s = sum((1. / d) for d in degrees.itervalues() if d != 0)
+        c = len(list(d for d in degrees.itervalues() if d != 0)) * p / s
+        sample = lambda d: c / d if d != 0 else p
+    for node, deg in degrees.iteritems():
+        s = sample(deg)
+        if s < 0.001:
+            ps[node] = 0.
+        elif s > 1.:
+            ps[node] = 1.
+        else:
+            ps[node] = s
+    #plt.hist(list(ps.itervalues()), 50)
+    #plt.savefig(distr + "_dist.pdf")
+    return ps
+
+
+def compute_performance(dataset):
+    graph, degrees = build_graph(dataset)
+    a = [int(len(graph) * i) for i in np.arange(0, 1.1, 0.1)]
+    r = (a,
+         [im(graph, degrees, k) for k in a],
+         [random(graph, degrees, k) for k in a],
+         [random_friend(graph, degrees, k) for k in a],
+         [aps(graph, degrees, k) for k in a])
+    with open(dataset + "_performance.txt", "w") as f:
+        f.write("\n".join("\t".join(map(str, k)) for k in zip(*r)))
+
+
+def compute_performance_p(dataset, distr=None):
+    graph, degrees = build_graph(dataset)
+    ps = np.arange(0.01, 0.99, 0.1)
+    a = [int(len(graph) * i) for i in np.arange(0, 1.1, 0.1)]
+    if distr is None:
+        l = [[aps(graph, degrees, k, p) for k in a]
+             for p in ps]
+    else:
+        l = [[aps(graph, degrees, k, generate_degree(degrees, p, distr))
+              for k in a]
+             for p in ps]
+    r = [a]
+    r += l
+    with open(dataset + "_performance_p_" + str(distr) + ".txt", "w") as f:
+        f.write("\n".join("\t".join(map(str, k)) for k in zip(*r)))
+
+
+def lp(graph, degrees, k):
+    reverse = {}
+    for user in sd_users(graph):
+        reverse[user] = []
+    for user in graph:
+        for friend in graph[user]:
+            reverse[friend].append(user)
+
+    prob = pulp.LpProblem("ads", pulp.LpMaximize)
+    x = pulp.LpVariable.dicts("x", graph.keys(), 0., 1.)
+    y = pulp.LpVariable.dicts("y", sd_users(graph), 0., 1.)
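+    # note (comment not in the original commit): x[u] is the fractional
+    # decision to target seed user u directly and y[v] the decision to
+    # count friend v as reached; the first constraint below forces
+    # y[v] <= sum of x[u] over the users u whose friend lists contain v,
+    # and the second caps the total spend at the budget k.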
+    prob += pulp.lpSum([degrees[user] * x[user] for user in graph] +
+                       [degrees[user] * y[user] for user in sd_users(graph)])
+    for user in sd_users(graph):
+        prob += pulp.lpSum([x[u] for u in reverse[user]] + [-y[user]]) >= 0
+    prob += pulp.lpSum([x[u] for u in graph] + [y[u] for u in reverse]) <= k
+    prob.solve(pulp.COIN_CMD())
+    print "Status:", pulp.LpStatus[prob.status]
+    print "Value =", pulp.value(prob.objective)
+    return pulp.value(prob.objective)
+
+
+def lp_perf():
+    graph, degrees = build_graph("peet")
+    a = [int(len(graph) * i) for i in np.arange(0, 1.1, 0.1)]
+    r = (a,
+         #[aps(graph, degrees, k) for k in a],
+         [lp(graph, degrees, k) for k in a])
+    with open("lp_running_time.txt", "w") as f:
+        f.write("\n".join("\t".join(map(str, k)) for k in zip(*r)))
+
+
+def lp_time():
+    graph, degrees = build_graph("big")
+    sp = sample(graph.keys(), int(sys.argv[2]))
+    graph = {s: graph[s] for s in sp}
+    a = int(sys.argv[1])
+    print len(list(sd_users(graph))), a
+    lp(graph, degrees, a)
+
+
+def aps_time():
+    graph, degrees = build_graph("big")
+    sp = sample(graph.keys(), int(sys.argv[2]))
+    graph = {s: graph[s] for s in sp}
+    a = int(sys.argv[1])
+    print len(list(sd_users(graph))), a
+    aps(graph, degrees, a, p=0.9, sampl=True)
+
+
+def lp_time_big():
+    graph, degrees = build_graph("big")
+    graph_big = {}
+    for i in xrange(10):
+        for user in graph:
+            graph_big[user + str(i)] = graph[user]
+            degrees[user + str(i)] = degrees[user]
+    aps(graph_big, degrees, 500)
+
+
+def hbo_likes():
+    graph, degrees = build_graph("hbo")
+    like_file = op.join(DATA_DIR, "hbo_likes.txt")
+    likes = {}
+    for line in open(like_file):
+        values = line.strip().split("\t")
+        if "HBO" in values[1:]:
+            likes[values[0]] = True
+    a = [int(len(graph) * i) for i in np.arange(0, 1.1, 0.1)]
+    l = [aps(graph, degrees, k) for k in a]
+    for user in graph:
+        graph[user] = [friend for friend in graph[user]
+                       if (friend in degrees and friend in likes)]
+    r = (a,
+         [im(graph, degrees, k) for k in a],
+         [aps(graph, degrees, k) for k in a],
+         l)
+    with open("hbo_likes_performance.txt", "w") as f:
+        f.write("\n".join("\t".join(map(str, k)) for k in zip(*r)))
+
+
+def stats():
+    for dataset in ["coachella"]:
+        graph, degrees = build_graph(dataset)
+        print dataset, len(graph) * 7, len(list(sd_users(graph))) * 7,\
+            np.mean([degrees[u] for u in graph]),\
+            np.mean([degrees[u] for u in sd_users(graph)])
+    for dataset in ["nyt", "gap", "gp", "kiva"]:
+        graph, degrees = build_graph(dataset)
+        print dataset, len(graph) * 6, len(list(sd_users(graph))) * 6,\
+            np.mean([degrees[u] for u in graph]),\
+            np.mean([degrees[u] for u in sd_users(graph)])
+    for dataset in ["google"]:
+        graph, degrees = build_graph(dataset)
+        print dataset, len(graph) * 5, len(list(sd_users(graph))) * 5,\
+            np.mean([degrees[u] for u in graph]),\
+            np.mean([degrees[u] for u in sd_users(graph)])
+    for dataset in ["lp", "hbo", "lmpt"]:
+        graph, degrees = build_graph(dataset)
+        print dataset, len(graph) * 3, len(list(sd_users(graph))) * 3,\
+            np.mean([degrees[u] for u in graph]),\
+            np.mean([degrees[u] for u in sd_users(graph)])
+    for dataset in ["peet"]:
+        graph, degrees = build_graph(dataset)
+        print dataset, len(graph) * 2, len(list(sd_users(graph))) * 2,\
+            np.mean([degrees[u] for u in graph]),\
+            np.mean([degrees[u] for u in sd_users(graph)])
+
+if __name__ == "__main__":
+    #for dataset in SYNTH_DATASETS:
+    #    compute_performance(dataset)
+    compute_performance_p("coachella", "deg")
+    #compute_performance("coachella")
+    #hbo_likes()
+    #lp_perf()
+    #lp_time()
+    #aps_time()
+    #stats()
+    #lp_time_big()
+    # _, degrees = build_graph2("coachella")
+    # with open("coachella_degrees.txt", "w") as fh:
+    #     for deg in degrees.itervalues():
+    #         fh.write(str(deg) + "\n")
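Read off the constraints above, the relaxation that lp() hands to the CBC
solver can be written as follows (notation is mine, not part of the commit:
V1 = the crawled seed users in graph, V2 = sd_users(graph), d = degrees,
N(u) = u's friend list):

    \max \sum_{u \in V_1} d_u x_u + \sum_{v \in V_2} d_v y_v
    \text{s.t.}\quad y_v \le \sum_{u : v \in N(u)} x_u \qquad \forall v \in V_2
    \sum_{u \in V_1} x_u + \sum_{v \in V_2} y_v \le k
    0 \le x_u, y_v \le 1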
