summaryrefslogtreecommitdiffstats
path: root/facebook_analysis/analyze.py
diff options
context:
space:
mode:
Diffstat (limited to 'facebook_analysis/analyze.py')
-rw-r--r--facebook_analysis/analyze.py340
1 files changed, 340 insertions, 0 deletions
diff --git a/facebook_analysis/analyze.py b/facebook_analysis/analyze.py
new file mode 100644
index 0000000..c5e6feb
--- /dev/null
+++ b/facebook_analysis/analyze.py
@@ -0,0 +1,340 @@
+from bs4 import BeautifulSoup
+import os.path as op
+from client.tasks import normalize
+from itertools import chain
+from random import sample
+from multiprocessing import Pool
+from ads import fs, fs_p, fs_ps, fs_p_sample
+import numpy as np
+import pulp
+import sys
+from random import seed, betavariate, normalvariate
+import matplotlib.pyplot as plt
+
# All paths are relative to the directory the script is run from.
DATA_DIR = "../facebook_data"  # scraped Facebook data: user degrees, seed pages, friend lists
# Brand-page datasets loaded by build_graph2() (see build_graph() dispatch).
DATASETS = ["hbo", "nyt", "lp", "google", "lmpt", "gp", "kiva", "coachella",
            "peet", "gap"]
SYNTH_DIR = "../apgl"  # synthetic graphs, loaded by build_graph1(); presumably APGL output
# Synthetic graph models; presumably "b-a" = Barabasi-Albert and "sw" =
# small-world -- TODO confirm, and confirm what "kk" denotes.
SYNTH_DATASETS = ["b-a", "kk", "sw"]
+
+
+def build_graph2(dataset):
+ users_file = op.join(DATA_DIR, dataset + "_users.txt")
+ seed_file = op.join(DATA_DIR, dataset + ".txt")
+ degrees = {}
+ graph = {}
+ with open(users_file) as f:
+ for line in f:
+ values = line.strip().split()
+ degrees[values[0]] = int(values[1])
+
+ soup = BeautifulSoup(open(seed_file))
+ links = [div.a["href"] for div in soup.findAll("div", class_="fsl")]
+ for link in links:
+ basename, fname, getname = normalize(link)
+ long_name = op.join(DATA_DIR, "facebook", fname)
+ if not op.isfile(long_name):
+ continue
+ else:
+ with open(long_name) as f:
+ friends = [normalize(line.strip())[1] for line in f]
+ friends = [friend for friend in friends
+ if (friend in degrees) and degrees[friend] > 0]
+ if len(friends) > 0:
+ friends = list(set(friends))
+ friends.sort(key=degrees.get, reverse=True)
+ graph[fname] = friends
+ degrees[fname] = len(friends)
+ print dataset, len(graph), len(list(sd_users(graph)))
+ return graph, degrees
+
+
+def build_graph1(dataset):
+ fname = op.join(SYNTH_DIR, dataset + ".txt")
+ degrees = {}
+ graph = {}
+ with open(fname) as fh:
+ for line in fh:
+ values = line.strip().split("\t")
+ node = int(values[0])
+ friends = zip(*[iter(values[1:])] * 2)
+ friends = [map(int, f) for f in friends]
+ for friend in friends:
+ degrees[friend[0]] = friend[1]
+ graph[node] = [friend[0] for friend in friends]
+ degrees[node] = len(graph[node])
+ print fname, len(graph), len(list(sd_users(graph)))
+ return graph, degrees
+
+
def build_graph(dataset):
    """Dispatch to the right loader for `dataset`.

    Facebook scrapes (names in DATASETS, plus "big") go through
    build_graph2(); anything else is treated as a synthetic graph and goes
    through build_graph1().
    """
    loader = build_graph2 if (dataset in DATASETS or dataset == "big") \
        else build_graph1
    return loader(dataset)
+
+
def print_graph(dataset):
    """Serialize a dataset's graph to "<dataset>_single_graph.txt".

    One line per seed user:
    user <TAB> friend1 <TAB> degree1 <TAB> friend2 <TAB> degree2 ...
    """
    graph, degrees = build_graph(dataset)
    with open(dataset + "_single_graph.txt", "w") as out:
        for user, friends in graph.iteritems():
            pairs = "\t".join(friend + "\t" + str(degrees[friend])
                              for friend in friends)
            out.write(user + "\t" + pairs + "\n")
+
+
def sd_users(graph):
    """Iterate over all second-degree users: every friend list, flattened.

    Users appearing in several friend lists are yielded once per occurrence.
    """
    return (friend for friends in graph.itervalues() for friend in friends)
+
+
def random(graph, degrees, n):
    """Baseline: expected total degree of `n` seed users drawn uniformly.

    Averages the degree sum over 100 independent draws.  (The name shadows
    the stdlib `random` module, but only `sample`/`seed`/... are imported at
    module level, so there is no clash.)
    """
    #n = int(len(graph) * ratio)
    values = []
    for _ in range(100):
        # Fix: random.sample needs a sequence; passing the dict itself made
        # py2 index it by position and raise KeyError on string keys.
        users = sample(list(graph), n)
        values.append(sum(degrees[user] for user in users))
    return sum(values) / float(len(values))
+
+
def random_friend(graph, degrees, n):
    """Baseline: pick n/2 random seed users plus one random friend of each.

    Each draw's value is sum(degree(user) + degree(random friend)); the
    result is averaged over 100 independent draws.
    """
    #n = int(len(graph) * ratio)
    values = []
    for _ in range(100):
        # Fixes: list(graph) because random.sample needs a sequence (a bare
        # dict KeyErrors on string keys in py2); n // 2 for explicit integer
        # division (n / 2 is a float under true division).
        users = sample(list(graph), n // 2)
        values.append(sum(degrees[sample(graph[user], 1)[0]] + degrees[user]
                          for user in users))
    return sum(values) / float(len(values))
+
+
def im(graph, degrees, n):
    """Influence-maximization baseline: total degree of the `n` seed users
    with the highest degrees."""
    #n = int(len(graph) * ratio)
    ranked = sorted(graph.iterkeys(), key=degrees.__getitem__, reverse=True)
    return sum(degrees[user] for user in ranked[:n])
+
+
def aps(graph, degrees, k, p=1, sampl=False):
    """Adaptive seeding value for budget `k`, maximized over budget splits.

    Every split point t in range(1, k - 1) between first-stage seeds and
    second-stage budget is solved in parallel on a 5-worker pool, using one
    of the `ads` module solvers:
      * p == 1        -> fs          (deterministic activation)
      * p is a dict   -> fs_ps       (per-node activation probabilities)
      * sampl is True -> fs_p_sample (sampling-based solver, scalar p != 1)
      * otherwise     -> fs_p        (scalar activation probability p)

    Returns the best value over all splits, or 0 when there are no splits
    (k <= 2 makes the task list empty).

    NOTE(review): range(1, k - 1) has k - 2 elements while the companion
    lists have k - 1, so zip() silently truncates to k - 2 tasks -- confirm
    whether range(1, k) was intended.
    """
    x = list(set(graph.keys()))  # distinct candidate seed users
    #k = int(len(graph) * ratio) # budget
    P = Pool(5)
    if p == 1:
        m = P.map(fs, zip(range(1, k - 1),
                          [k] * (k - 1),
                          [x] * (k - 1),
                          [graph] * (k - 1),
                          [degrees] * (k - 1)))
    elif type(p) is dict:
        m = P.map(fs_ps, zip(range(1, k - 1),
                             [k] * (k - 1),
                             [x] * (k - 1),
                             [graph] * (k - 1),
                             [degrees] * (k - 1),
                             [p] * (k - 1)))
    elif sampl:
        m = P.map(fs_p_sample, zip(range(1, k - 1),
                                   [k] * (k - 1),
                                   [x] * (k - 1),
                                   [graph] * (k - 1),
                                   [degrees] * (k - 1),
                                   [p] * (k - 1)))
    else:
        m = P.map(fs_p, zip(range(1, k - 1),
                            [k] * (k - 1),
                            [x] * (k - 1),
                            [graph] * (k - 1),
                            [degrees] * (k - 1),
                            [p] * (k - 1)))

    P.close()
    # max() raises ValueError on an empty result list (no splits for k <= 2).
    try:
        m = max(m)
    except ValueError:
        m = 0
    print m
    return m
+
+
def generate_degree(degrees, p, distr="beta"):
    """Draw a per-node activation probability for every node in `degrees`.

    distr selects the distribution (all tuned so the mean is roughly p):
      * "beta"  -- Beta(5p/(1-p), 5)
      * "gauss" -- Normal(p, 0.01)
      * "power" -- Beta(1, (1-p)/p), a power-law-like shape
      * "deg"   -- deterministic, inversely proportional to degree,
                   normalized so the mean over nonzero-degree nodes is p;
                   zero-degree nodes get p itself.

    Draws are clamped into [0, 1], with values below 0.001 flushed to 0.
    Returns {node: probability}.  Raises ValueError for an unknown `distr`
    (previously this fell through to a confusing NameError).
    """
    seed()  # reseed the shared RNG from system entropy
    ps = {}
    if distr == "beta":
        beta = 5.
        alpha = 5. * p / (1 - p)
        draw = lambda d: betavariate(alpha, beta)
    elif distr == "gauss":
        draw = lambda d: normalvariate(p, 0.01)
    elif distr == "power":
        alpha = 1.
        beta = (1. - p) / p
        draw = lambda d: betavariate(alpha, beta)
    elif distr == "deg":
        s = sum((1. / d) for d in degrees.values() if d != 0)
        # Fix: the original counted nonzero *keys* ("for d in degrees"),
        # not nonzero degree values, inconsistent with `s` above.
        c = len([d for d in degrees.values() if d != 0]) * p / s
        draw = lambda d: c / d if d != 0 else p
    else:
        raise ValueError("unknown distribution: " + str(distr))
    # Renamed the local from `sample` to `draw`: it shadowed the imported
    # random.sample.
    for node, deg in degrees.items():
        value = draw(deg)
        if value < 0.001:
            ps[node] = 0.
        elif value > 1.:
            ps[node] = 1.
        else:
            ps[node] = value
    return ps
+
+
def compute_performance(dataset):
    """Benchmark the four seeding strategies on `dataset`.

    Budgets are 0%, 10%, ..., 100% of the seed-user count.  Writes
    "<dataset>_performance.txt", one row per budget with columns:
    budget, im, random, random_friend, aps.
    """
    graph, degrees = build_graph(dataset)
    budgets = [int(len(graph) * frac) for frac in np.arange(0, 1.1, 0.1)]
    columns = (budgets,
               [im(graph, degrees, b) for b in budgets],
               [random(graph, degrees, b) for b in budgets],
               [random_friend(graph, degrees, b) for b in budgets],
               [aps(graph, degrees, b) for b in budgets])
    with open(dataset + "_performance.txt", "w") as out:
        out.write("\n".join("\t".join(map(str, row))
                            for row in zip(*columns)))
+
+
def compute_performance_p(dataset, distr=None):
    """Sweep the activation probability p for adaptive seeding on `dataset`.

    For each p in 0.01, 0.11, ..., runs aps() over budgets of 0%..100% of
    the seed-user count; when `distr` is given, per-node probabilities are
    regenerated (fresh random draw) for every (p, budget) pair.  Writes
    "<dataset>_performance_p_<distr>.txt": first column budgets, then one
    column per p value.
    """
    graph, degrees = build_graph(dataset)
    probabilities = np.arange(0.01, 0.99, 0.1)
    budgets = [int(len(graph) * frac) for frac in np.arange(0, 1.1, 0.1)]
    rows = [budgets]
    for p in probabilities:
        if distr is None:
            rows.append([aps(graph, degrees, b, p) for b in budgets])
        else:
            rows.append([aps(graph, degrees, b,
                             generate_degree(degrees, p, distr))
                         for b in budgets])
    with open(dataset + "_performance_p_" + str(distr) + ".txt", "w") as out:
        out.write("\n".join("\t".join(map(str, row)) for row in zip(*rows)))
+
+
+def lp(graph, degrees, k):
+ reverse = {}
+ for user in sd_users(graph):
+ reverse[user] = []
+ for user in graph:
+ for friend in graph[user]:
+ reverse[friend].append(user)
+
+ prob = pulp.LpProblem("ads", pulp.LpMaximize)
+ x = pulp.LpVariable.dicts("x", graph.keys(), 0., 1.)
+ y = pulp.LpVariable.dicts("y", sd_users(graph), 0., 1.)
+ prob += pulp.lpSum([degrees[user] * x[user] for user in graph] +
+ [degrees[user] * y[user] for user in sd_users(graph)])
+ for user in sd_users(graph):
+ prob += pulp.lpSum([x[u] for u in reverse[user]] + [-y[user]]) >= 0
+ prob += pulp.lpSum([x[u] for u in graph] + [y[u] for u in reverse]) <= k
+ prob.solve(pulp.COIN_CMD())
+ print "Status:", pulp.LpStatus[prob.status]
+ print "Value =", pulp.value(prob.objective)
+
+
def lp_perf():
    """Run the LP relaxation on "peet" across budgets of 0%..100% of the
    seed users and write the results to "lp_running_time.txt"."""
    graph, degrees = build_graph("peet")
    budgets = [int(len(graph) * frac) for frac in np.arange(0, 1.1, 0.1)]
    results = [lp(graph, degrees, b) for b in budgets]
    with open("lp_running_time.txt", "w") as out:
        out.write("\n".join("\t".join(map(str, row))
                            for row in zip(budgets, results)))
+
+
+def lp_time():
+ graph, degrees = build_graph("big")
+ sp = sample(graph.keys(), int(sys.argv[2]))
+ graph = {s: graph[s] for s in sp}
+ a = int(sys.argv[1])
+ print len(list(sd_users(graph))), a
+ lp(graph, degrees, a)
+
+
+def aps_time():
+ graph, degrees = build_graph("big")
+ sp = sample(graph.keys(), int(sys.argv[2]))
+ graph = {s: graph[s] for s in sp}
+ a = int(sys.argv[1])
+ print len(list(sd_users(graph))), a
+ aps(graph, degrees, a, p=0.9, sampl=True)
+
+
def lp_time_big():
    """Replicate the "big" graph 10x (suffixing each user with 0..9, sharing
    the original friend lists) and run adaptive seeding with budget 500.

    NOTE(review): despite the name, this times aps(), not lp().  The shared
    `degrees` dict is mutated in place with the suffixed copies.
    """
    graph, degrees = build_graph("big")
    replicated = {}
    for i in xrange(10):
        suffix = str(i)
        for user in graph:
            replicated[user + suffix] = graph[user]
            degrees[user + suffix] = degrees[user]
    aps(replicated, degrees, 500)
+
+
def hbo_likes():
    """Compare seeding on the hbo graph before and after restricting
    second-degree users to those who "like" HBO.

    Reads "hbo_likes.txt" (user_id <TAB> like <TAB> like ...), then writes
    "hbo_likes_performance.txt" with columns: budget, im on the restricted
    graph, aps on the restricted graph, aps on the full graph.
    """
    graph, degrees = build_graph("hbo")
    like_file = op.join(DATA_DIR, "hbo_likes.txt")
    likes = {}
    # Fix: iterate inside a with-block; the bare open() in the for statement
    # never closed the file handle.
    with open(like_file) as fh:
        for line in fh:
            values = line.strip().split("\t")
            if "HBO" in values[1:]:
                likes[values[0]] = True
    a = [int(len(graph) * i) for i in np.arange(0, 1.1, 0.1)]
    # aps on the unrestricted graph, computed before the filter below
    # mutates `graph` in place.
    full_aps = [aps(graph, degrees, k) for k in a]
    for user in graph:
        graph[user] = [friend for friend in graph[user]
                       if (friend in degrees and friend in likes)]
    r = (a,
         [im(graph, degrees, k) for k in a],
         [aps(graph, degrees, k) for k in a],
         full_aps)
    with open("hbo_likes_performance.txt", "w") as f:
        f.write("\n".join("\t".join(map(str, k)) for k in zip(*r)))
+
+
+def stats():
+ for dataset in ["coachella"]:
+ graph, degrees = build_graph(dataset)
+ print dataset, len(graph) * 7, len(list(sd_users(graph))) * 7,\
+ np.mean([degrees[u] for u in graph]),\
+ np.mean([degrees[u] for u in sd_users(graph)])
+ for dataset in ["nyt", "gap", "gp", "kiva"]:
+ graph, degrees = build_graph(dataset)
+ print dataset, len(graph) * 6, len(list(sd_users(graph))) * 6,\
+ np.mean([degrees[u] for u in graph]),\
+ np.mean([degrees[u] for u in sd_users(graph)])
+ for dataset in ["google"]:
+ graph, degrees = build_graph(dataset)
+ print dataset, len(graph) * 5, len(list(sd_users(graph))) * 5,\
+ np.mean([degrees[u] for u in graph]),\
+ np.mean([degrees[u] for u in sd_users(graph)])
+ for dataset in ["lp", "hbo", "lmpt"]:
+ graph, degrees = build_graph(dataset)
+ print dataset, len(graph) * 3, len(list(sd_users(graph))) * 3,\
+ np.mean([degrees[u] for u in graph]),\
+ np.mean([degrees[u] for u in sd_users(graph)])
+ for dataset in ["peet"]:
+ graph, degrees = build_graph(dataset)
+ print dataset, len(graph) * 2, len(list(sd_users(graph))) * 2,\
+ np.mean([degrees[u] for u in graph]),\
+ np.mean([degrees[u] for u in sd_users(graph)])
+
if __name__ == "__main__":
    # Manual experiment driver: exactly one experiment is enabled at a time;
    # the commented lines are hand-toggled alternatives from past runs.
    #for dataset in SYNTH_DATASETS:
    #    compute_performance(dataset)
    compute_performance_p("coachella", "deg")
    #compute_performance("coachella")
    #hbo_likes()
    #lp_perf()
    #lp_time()
    #aps_time()
    #stats()
    #lp_time_big()
    # One-off dump of the coachella degree distribution:
    # _, degrees = build_graph2("coachella")
    # with open("coachella_degrees.txt", "w") as fh:
    #     for deg in degrees.itervalues():
    #         fh.write(str(deg) + "\n")