from bs4 import BeautifulSoup import os.path as op from client.tasks import normalize from itertools import chain from random import sample from multiprocessing import Pool from ads import fs, fs_p, fs_ps, fs_p_sample import numpy as np import pulp import sys from random import seed, betavariate, normalvariate import matplotlib.pyplot as plt DATA_DIR = "../facebook_data" DATASETS = ["hbo", "nyt", "lp", "google", "lmpt", "gp", "kiva", "coachella", "peet", "gap"] SYNTH_DIR = "../apgl" SYNTH_DATASETS = ["b-a", "kk", "sw"] def build_graph2(dataset): users_file = op.join(DATA_DIR, dataset + "_users.txt") seed_file = op.join(DATA_DIR, dataset + ".txt") degrees = {} graph = {} with open(users_file) as f: for line in f: values = line.strip().split() degrees[values[0]] = int(values[1]) soup = BeautifulSoup(open(seed_file)) links = [div.a["href"] for div in soup.findAll("div", class_="fsl")] for link in links: basename, fname, getname = normalize(link) long_name = op.join(DATA_DIR, "facebook", fname) if not op.isfile(long_name): continue else: with open(long_name) as f: friends = [normalize(line.strip())[1] for line in f] friends = [friend for friend in friends if (friend in degrees) and degrees[friend] > 0] if len(friends) > 0: friends = list(set(friends)) friends.sort(key=degrees.get, reverse=True) graph[fname] = friends degrees[fname] = len(friends) print dataset, len(graph), len(list(sd_users(graph))) return graph, degrees def build_graph1(dataset): fname = op.join(SYNTH_DIR, dataset + ".txt") degrees = {} graph = {} with open(fname) as fh: for line in fh: values = line.strip().split("\t") node = int(values[0]) friends = zip(*[iter(values[1:])] * 2) friends = [map(int, f) for f in friends] for friend in friends: degrees[friend[0]] = friend[1] graph[node] = [friend[0] for friend in friends] degrees[node] = len(graph[node]) print fname, len(graph), len(list(sd_users(graph))) return graph, degrees def build_graph(dataset): if dataset in DATASETS or dataset == "big": return build_graph2(dataset) else: return build_graph1(dataset) def print_graph(dataset): graph, degrees = build_graph(dataset) with open(dataset + "_single_graph.txt", "w") as f: for user, friends in graph.iteritems(): friends_deg = [(friend, str(degrees[friend])) for friend in friends] f.write(user + "\t" + "\t".join("\t".join(friend) for friend in friends_deg) + "\n") def sd_users(graph): return chain.from_iterable(graph.itervalues()) def random(graph, degrees, n): #n = int(len(graph) * ratio) values = [] for _ in xrange(100): users = sample(graph, n) values.append(sum(degrees[user] for user in users)) return sum(values) / float(len(values)) def random_friend(graph, degrees, n): #n = int(len(graph) * ratio) values = [] for _ in xrange(100): users = sample(graph, n / 2) values.append(sum(degrees[sample(graph[user], 1)[0]] + degrees[user] for user in users)) return sum(values) / float(len(values)) def im(graph, degrees, n): #n = int(len(graph) * ratio) l = list(graph.iterkeys()) l.sort(key=lambda x: degrees[x], reverse=True) return sum(degrees[user] for user in l[:n]) def aps(graph, degrees, k, p=1, sampl=False): x = list(set(graph.keys())) #k = int(len(graph) * ratio) # budget P = Pool(5) if p == 1: m = P.map(fs, zip(range(1, k - 1), [k] * (k - 1), [x] * (k - 1), [graph] * (k - 1), [degrees] * (k - 1))) elif type(p) is dict: m = P.map(fs_ps, zip(range(1, k - 1), [k] * (k - 1), [x] * (k - 1), [graph] * (k - 1), [degrees] * (k - 1), [p] * (k - 1))) elif sampl: m = P.map(fs_p_sample, zip(range(1, k - 1), [k] * (k - 1), [x] * (k - 1), [graph] * (k - 1), [degrees] * (k - 1), [p] * (k - 1))) else: m = P.map(fs_p, zip(range(1, k - 1), [k] * (k - 1), [x] * (k - 1), [graph] * (k - 1), [degrees] * (k - 1), [p] * (k - 1))) P.close() try: m = max(m) except ValueError: m = 0 print m return m def generate_degree(degrees, p, distr="beta"): #plt.figure() seed() ps = {} if distr == "beta": beta = 5. alpha = 5. * p / (1 - p) sample = lambda d: betavariate(alpha, beta) elif distr == "gauss": sample = lambda d: normalvariate(p, 0.01) elif distr == "power": alpha = 1. beta = (1. - p) / p sample = lambda d: betavariate(alpha, beta) elif distr == "deg": s = sum((1. / d) for d in degrees.itervalues() if d != 0) c = len(list(d for d in degrees if d != 0)) * p / s sample = lambda d: c / d if d != 0 else p for node, deg in degrees.iteritems(): s = sample(deg) if s < 0.001: ps[node] = 0. elif s > 1.: ps[node] = 1. else: ps[node] = s #plt.hist(list(ps.itervalues()), 50) #plt.savefig(distr + "_dist.pdf") return ps def compute_performance(dataset): graph, degrees = build_graph(dataset) a = [int(len(graph) * i) for i in np.arange(0, 1.1, 0.1)] r = (a, [im(graph, degrees, k) for k in a], [random(graph, degrees, k) for k in a], [random_friend(graph, degrees, k) for k in a], [aps(graph, degrees, k) for k in a]) with open(dataset + "_performance.txt", "w") as f: f.write("\n".join("\t".join(map(str, k)) for k in zip(*r))) def compute_performance_p(dataset, distr=None): graph, degrees = build_graph(dataset) ps = np.arange(0.01, 0.99, 0.1) a = [int(len(graph) * i) for i in np.arange(0, 1.1, 0.1)] if distr is None: l = [[aps(graph, degrees, k, p) for k in a] for p in ps] else: l = [[aps(graph, degrees, k, generate_degree(degrees, p, distr)) for k in a] for p in ps] r = [a] r += l with open(dataset + "_performance_p_" + str(distr) + ".txt", "w") as f: f.write("\n".join("\t".join(map(str, k)) for k in zip(*r))) def lp(graph, degrees, k): reverse = {} for user in sd_users(graph): reverse[user] = [] for user in graph: for friend in graph[user]: reverse[friend].append(user) prob = pulp.LpProblem("ads", pulp.LpMaximize) x = pulp.LpVariable.dicts("x", graph.keys(), 0., 1.) y = pulp.LpVariable.dicts("y", sd_users(graph), 0., 1.) prob += pulp.lpSum([degrees[user] * x[user] for user in graph] + [degrees[user] * y[user] for user in sd_users(graph)]) for user in sd_users(graph): prob += pulp.lpSum([x[u] for u in reverse[user]] + [-y[user]]) >= 0 prob += pulp.lpSum([x[u] for u in graph] + [y[u] for u in reverse]) <= k prob.solve(pulp.COIN_CMD()) print "Status:", pulp.LpStatus[prob.status] print "Value =", pulp.value(prob.objective) def lp_perf(): graph, degrees = build_graph("peet") a = [int(len(graph) * i) for i in np.arange(0, 1.1, 0.1)] r = (a, #[aps(graph, degrees, k) for k in a], [lp(graph, degrees, k) for k in a]) with open("lp_running_time.txt", "w") as f: f.write("\n".join("\t".join(map(str, k)) for k in zip(*r))) def lp_time(): graph, degrees = build_graph("big") sp = sample(graph.keys(), int(sys.argv[2])) graph = {s: graph[s] for s in sp} a = int(sys.argv[1]) print len(list(sd_users(graph))), a lp(graph, degrees, a) def aps_time(): graph, degrees = build_graph("big") sp = sample(graph.keys(), int(sys.argv[2])) graph = {s: graph[s] for s in sp} a = int(sys.argv[1]) print len(list(sd_users(graph))), a aps(graph, degrees, a, p=0.9, sampl=True) def lp_time_big(): graph, degrees = build_graph("big") graph_big = {} for i in xrange(10): for user in graph: graph_big[user + str(i)] = graph[user] degrees[user + str(i)] = degrees[user] aps(graph_big, degrees, 500) def hbo_likes(): graph, degrees = build_graph("hbo") like_file = op.join(DATA_DIR, "hbo_likes.txt") likes = {} for line in open(like_file): values = line.strip().split("\t") if "HBO" in values[1:]: likes[values[0]] = True a = [int(len(graph) * i) for i in np.arange(0, 1.1, 0.1)] l = [aps(graph, degrees, k) for k in a] for user in graph: graph[user] = [friend for friend in graph[user] if (friend in degrees and friend in likes)] r = (a, [im(graph, degrees, k) for k in a], [aps(graph, degrees, k) for k in a], l) with open("hbo_likes_performance.txt", "w") as f: f.write("\n".join("\t".join(map(str, k)) for k in zip(*r))) def stats(): for dataset in ["coachella"]: graph, degrees = build_graph(dataset) print dataset, len(graph) * 7, len(list(sd_users(graph))) * 7,\ np.mean([degrees[u] for u in graph]),\ np.mean([degrees[u] for u in sd_users(graph)]) for dataset in ["nyt", "gap", "gp", "kiva"]: graph, degrees = build_graph(dataset) print dataset, len(graph) * 6, len(list(sd_users(graph))) * 6,\ np.mean([degrees[u] for u in graph]),\ np.mean([degrees[u] for u in sd_users(graph)]) for dataset in ["google"]: graph, degrees = build_graph(dataset) print dataset, len(graph) * 5, len(list(sd_users(graph))) * 5,\ np.mean([degrees[u] for u in graph]),\ np.mean([degrees[u] for u in sd_users(graph)]) for dataset in ["lp", "hbo", "lmpt"]: graph, degrees = build_graph(dataset) print dataset, len(graph) * 3, len(list(sd_users(graph))) * 3,\ np.mean([degrees[u] for u in graph]),\ np.mean([degrees[u] for u in sd_users(graph)]) for dataset in ["peet"]: graph, degrees = build_graph(dataset) print dataset, len(graph) * 2, len(list(sd_users(graph))) * 2,\ np.mean([degrees[u] for u in graph]),\ np.mean([degrees[u] for u in sd_users(graph)]) if __name__ == "__main__": #for dataset in SYNTH_DATASETS: # compute_performance(dataset) compute_performance_p("coachella", "deg") #compute_performance("coachella") #hbo_likes() #lp_perf() #lp_time() #aps_time() #stats() #lp_time_big() # _, degrees = build_graph2("coachella") # with open("coachella_degrees.txt", "w") as fh: # for deg in degrees.itervalues(): # fh.write(str(deg) + "\n")