From ece1d828d53d6123fcecb5ea8bf9b126d1728ccc Mon Sep 17 00:00:00 2001 From: Thibaut Horel Date: Fri, 24 Oct 2014 12:16:51 -0400 Subject: Add code --- Makefile | 8 - apgl/main.py | 66 +++++ distribution.py | 30 --- facebook_analysis/Makefile | 3 + facebook_analysis/ads.pyx | 403 ++++++++++++++++++++++++++++++ facebook_analysis/analyze.py | 340 +++++++++++++++++++++++++ facebook_analysis/seed.py | 247 ++++++++++++++++++ facebook_analysis/setup.py | 4 + facebook_scraping/Makefile | 49 ++++ facebook_scraping/client/Makefile | 15 ++ facebook_scraping/client/__init__.py | 0 facebook_scraping/client/requirements.txt | 4 + facebook_scraping/client/tasks.py | 243 ++++++++++++++++++ facebook_scraping/limits.py | 6 + facebook_scraping/mturk.py | 16 ++ facebook_scraping/run.py | 91 +++++++ facebook_scraping/run2.py | 90 +++++++ facebook_scraping/seed.py | 7 + facebook_scraping/server.py | 16 ++ fb_accounts.txt | 2 - requirements.txt | 4 - run.py | 73 ------ seed.txt | 1 - server.py | 16 -- tasks.py | 111 -------- twitter/api_accounts.txt | 1 - twitter/dispatcher.py | 2 - twitter/scraper.py | 3 +- twitter/stream.py | 7 +- 29 files changed, 1604 insertions(+), 254 deletions(-) delete mode 100644 Makefile create mode 100644 apgl/main.py delete mode 100644 distribution.py create mode 100644 facebook_analysis/Makefile create mode 100644 facebook_analysis/ads.pyx create mode 100644 facebook_analysis/analyze.py create mode 100644 facebook_analysis/seed.py create mode 100644 facebook_analysis/setup.py create mode 100644 facebook_scraping/Makefile create mode 100644 facebook_scraping/client/Makefile create mode 100644 facebook_scraping/client/__init__.py create mode 100644 facebook_scraping/client/requirements.txt create mode 100644 facebook_scraping/client/tasks.py create mode 100644 facebook_scraping/limits.py create mode 100644 facebook_scraping/mturk.py create mode 100644 facebook_scraping/run.py create mode 100644 facebook_scraping/run2.py create mode 100644 facebook_scraping/seed.py create mode 100644 facebook_scraping/server.py delete mode 100644 fb_accounts.txt delete mode 100644 requirements.txt delete mode 100644 run.py delete mode 100644 seed.txt delete mode 100644 server.py delete mode 100644 tasks.py diff --git a/Makefile b/Makefile deleted file mode 100644 index 61478d4..0000000 --- a/Makefile +++ /dev/null @@ -1,8 +0,0 @@ -all: boostrap run - -boostrap: - wget http://thibaut.horel.org/facebook/facebook.tar.gz - tar -xzf facebook.tar.gz - -run: - celery -A tasks --concurrency=1 worker -l info diff --git a/apgl/main.py b/apgl/main.py new file mode 100644 index 0000000..c9d5f11 --- /dev/null +++ b/apgl/main.py @@ -0,0 +1,66 @@ +from apgl.graph import SparseGraph +from apgl.generator.BarabasiAlbertGenerator import BarabasiAlbertGenerator +from apgl.generator.SmallWorldGenerator import SmallWorldGenerator +from apgl.generator.KroneckerGenerator import KroneckerGenerator +from apgl.generator.ConfigModelGenerator import ConfigModelGenerator +from random import sample +from math import log +import numpy as np + +vertices = 10000 + + +def pgraph(l, name): + s = sample(range(len(l)), int(len(l) / 100.)) + with open(name, "w") as fh: + for i in s: + friends = [(f, len(l[f])) for f in l[i]] + line = str(i) + "\t" + "\t".join("\t".join(map(str, a)) + for a in friends) + fh.write(line + "\n") + + +def ba(): + graph = SparseGraph(vertices) + generator = BarabasiAlbertGenerator(10, 10) + graph = generator.generate(graph) + l, _ = graph.adjacencyList() + pgraph(l, "b-a.txt") + + +def sw(): + # slow + graph = 
SparseGraph(vertices) + generator = SmallWorldGenerator(0.3, 50) + graph = generator.generate(graph) + l, _ = graph.adjacencyList() + pgraph(l, "sw.txt") + + +def kk(): + init = SparseGraph(4) + init[0, 1] = 1 + init[0, 2] = 1 + init[0, 3] = 1 + for i in range(4): + init[i, i] = 1 + k = int(log(vertices, 4)) + 1 + generator = KroneckerGenerator(init, k) + graph = generator.generate() + l, _ = graph.adjacencyList() + pgraph(l, "kk.txt") + + +def cm(): + with open("../facebook_analysis/coachella_degrees.txt") as fh: + l = [int(line.strip()) for line in fh] + l = np.array(l) + n = len(l) + graph = SparseGraph(n) + generator = ConfigModelGenerator(l) + graph = generator.generate(graph) + l, _ = graph.adjacencyList() + pgraph(l, "cm.txt") + + +cm() diff --git a/distribution.py b/distribution.py deleted file mode 100644 index 49d2d5e..0000000 --- a/distribution.py +++ /dev/null @@ -1,30 +0,0 @@ -import matplotlib.pyplot as plt -import numpy as np -import sys - - -def load_distribution(filename): - l = [int(line.strip()) for line in open(filename)] - l = sorted(l) - l = l[:] - return l - - -def plot_distribution(files): - for file in files: - x = load_distribution(file) - a = np.array(x) - print file, a.mean(), a.size - n, bins, patches = plt.hist(x, 50, normed=1, facecolor='green') - plt.show() - # # n_nodes = float(sum(y)) - # plt.plot(x, np.array(y), ".") - # if save: - # fig = plt.gcf() - # fig.set_size_inches(20, 15) - # # plt.savefig(output, bbox_inches="tight") - # else: - # plt.show() - -if __name__ == "__main__": - plot_distribution(sys.argv[1:]) diff --git a/facebook_analysis/Makefile b/facebook_analysis/Makefile new file mode 100644 index 0000000..e3df848 --- /dev/null +++ b/facebook_analysis/Makefile @@ -0,0 +1,3 @@ +all: + python2 setup.py build_ext --inplace + cython2 -a ads.pyx diff --git a/facebook_analysis/ads.pyx b/facebook_analysis/ads.pyx new file mode 100644 index 0000000..2682456 --- /dev/null +++ b/facebook_analysis/ads.pyx @@ -0,0 +1,403 @@ +#!python +#cython: boundscheck=False, nonecheck=False +cimport cython +import random + + +@cython.nonecheck(False) +cdef int merge_opt(list l1, list l2, int t, dict degrees): + cdef int n1, n2, i, j, n, s + n1 = len(l1) + n2 = len(l2) + i = j = n = s = 0 + while n < t: + if i == n1 and j == n2: + break + if i == n1: + s += degrees[l2[j]] + j += 1 + n += 1 + continue + if j == n2: + s += degrees[l1[i]] + i += 1 + n += 1 + continue + + li, lj = l1[i], l2[j] + di, dj = degrees[li], degrees[lj] + if lj == li: + j += 1 + s += di + i += 1 + n += 1 + elif di > dj: + s += di + i += 1 + n += 1 + else: + s += dj + j += 1 + n += 1 + return s + +@cython.nonecheck(False) +cdef float merge_opt_p(list l1, list l2, int t, dict degrees, float p): + cdef int n1, n2, i, j + cdef float n, s + n1 = len(l1) + n2 = len(l2) + i = j = 0 + n = s = 0. + while n < t: + if i == n1 and j == n2: + break + if i == n1: + s += degrees[l2[j]]*p + j += 1 + n += p + continue + if j == n2: + s += degrees[l1[i]]*p + i += 1 + n += p + continue + + li, lj = l1[i], l2[j] + di, dj = degrees[li], degrees[lj] + if lj == li: + j += 1 + s += di*p + i += 1 + n += p + elif di > dj: + s += di*p + i += 1 + n += p + else: + s += dj*p + j += 1 + n += p + return s + + +@cython.nonecheck(False) +cdef float merge_opt_p_sample(list l1, list l2, int t, dict degrees, float p): + random.seed() + cdef int n1, n2, i, j + cdef float n, s + n1 = len(l1) + n2 = len(l2) + i = j = 0 + n = s = 0. 
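+ # Sampled variant of merge_opt_p: merge the two degree-sorted friend lists, weighting each friend taken by the activation probability p, and normalize the accumulated sum at the end.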
+ cdef int k + cdef dict a + cdef float r + k = 0 + r = 0 + a = {} + for k in xrange(len(degrees)**2): + for d in degrees: + a[d] = random.random() + while n < t: + if i == n1 and j == n2: + break + if i == n1: + s += degrees[l2[j]]*p + j += 1 + n += p + continue + if j == n2: + s += degrees[l1[i]]*p + i += 1 + n += p + continue + + li, lj = l1[i], l2[j] + di, dj = degrees[li], degrees[lj] + if lj == li: + j += 1 + s += di*p + i += 1 + n += p + elif di > dj: + s += di*p + i += 1 + n += p + else: + s += dj*p + j += 1 + n += p + r += s + r /= n**2 + return r + + +@cython.nonecheck(False) +cdef float merge_opt_ps(list l1, list l2, int t, dict degrees, dict p): + cdef int n1, n2, i, j + cdef float n, s + n1 = len(l1) + n2 = len(l2) + i = j = 0 + n = s = 0. + while n < t: + if i == n1 and j == n2: + break + if i == n1: + s += degrees[l2[j]]*p[l2[j]] + n += p[l2[j]] + j += 1 + continue + if j == n2: + s += degrees[l1[i]]*p[l1[i]] + n += p[l1[i]] + i += 1 + continue + + li, lj = l1[i], l2[j] + di, dj = degrees[li], degrees[lj] + if lj == li: + j += 1 + s += di*p[li] + n += p[li] + i += 1 + elif di > dj: + s += di*p[li] + n += p[li] + i += 1 + else: + s += dj*p[lj] + n += p[lj] + j += 1 + return s + + +@cython.nonecheck(False) +cdef list merge(list l1, list l2, int t, dict degrees): + cdef int n1, n2, i, j, n + n1 = len(l1) + n2 = len(l2) + result = [] + i = j = n = 0 + while n < t: + if i == n1 and j == n2: + break + if i == n1: + result.append(l2[j]) + j += 1 + n += 1 + continue + if j == n2: + result.append(l1[i]) + i += 1 + n += 1 + continue + + if l2[j] == l1[i]: + j += 1 + result.append(l1[i]) + i += 1 + n += 1 + elif degrees[l1[i]] > degrees[l2[j]]: + result.append(l1[i]) + i += 1 + n += 1 + else: + result.append(l2[j]) + j += 1 + n += 1 + return result + +@cython.nonecheck(False) +cdef list merge_p(list l1, list l2, int t, dict degrees, float p): + cdef int n1, n2, i, j + cdef float n + n1 = len(l1) + n2 = len(l2) + result = [] + i = j = 0 + n = 0. + while n < t: + if i == n1 and j == n2: + break + if i == n1: + result.append(l2[j]) + j += 1 + n += p + continue + if j == n2: + result.append(l1[i]) + i += 1 + n += p + continue + + if l2[j] == l1[i]: + j += 1 + result.append(l1[i]) + i += 1 + n += p + elif degrees[l1[i]] > degrees[l2[j]]: + result.append(l1[i]) + i += 1 + n += p + else: + result.append(l2[j]) + j += 1 + n += p + return result + + +@cython.nonecheck(False) +cdef list merge_ps(list l1, list l2, int t, dict degrees, dict p): + cdef int n1, n2, i, j + cdef float n + n1 = len(l1) + n2 = len(l2) + result = [] + i = j = 0 + n = 0. 
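+ # Same merge as merge_p, except each friend contributes its own probability p[friend] to the fractional count n.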
+ while n < t: + if i == n1 and j == n2: + break + if i == n1: + result.append(l2[j]) + n += p[l2[j]] + j += 1 + continue + if j == n2: + result.append(l1[i]) + n += p[l1[i]] + i += 1 + continue + + if l2[j] == l1[i]: + j += 1 + result.append(l1[i]) + n += p[l1[i]] + i += 1 + elif degrees[l1[i]] > degrees[l2[j]]: + result.append(l1[i]) + n += p[l1[i]] + i += 1 + else: + result.append(l2[j]) + n += p[l2[j]] + j += 1 + return result + +@cython.nonecheck(False) +def fs(tuple a): + cdef int cur_val, best_diff, best_value, o, t, k + cdef list x + cdef dict graph + cdef dict degrees + t, k, x, graph, degrees = a + cdef list n + cur_val = 0 + n = [] # neighbors of greedy set + for i in range(1, k - t): + best_diff = 0 + best_user = None + best_value = 0 + for user in x: + o = merge_opt(n, graph[user], t, degrees) + if o - cur_val > best_diff: + best_diff = o - cur_val + best_user = user + best_value = o + if best_user is not None: + x.remove(best_user) + cur_val = best_value + n = merge(n, graph[best_user], t, degrees) + else: + break + return cur_val + +@cython.nonecheck(False) +def fs_p(tuple a): + cdef int t, k + cdef float o, p, best_value, best_diff, cur_val + cdef list x + cdef dict graph + cdef dict degrees + t, k, x, graph, degrees, p = a + cdef list n + cur_val = 0 + n = [] # neighbors of greedy set + for i in range(1, k - t): + best_diff = 0 + best_user = None + best_value = 0 + for user in x: + o = merge_opt_p(n, graph[user], t, degrees, p) + if o - cur_val > best_diff: + best_diff = o - cur_val + best_user = user + best_value = o + if best_user is not None: + x.remove(best_user) + cur_val = best_value + n = merge_p(n, graph[best_user], t, degrees, p) + else: + break + return cur_val + +@cython.nonecheck(False) +def fs_p_sample(tuple a): + cdef int t, k + cdef float o, p, best_value, best_diff, cur_val + cdef list x + cdef dict graph + cdef dict degrees + t, k, x, graph, degrees, p = a + cdef list n + cur_val = 0 + n = [] # neighbors of greedy set + for i in range(1, k - t): + best_diff = 0 + best_user = None + best_value = 0 + for user in x: + o = merge_opt_p_sample(n, graph[user], t, degrees, p) + if o - cur_val > best_diff: + best_diff = o - cur_val + best_user = user + best_value = o + if best_user is not None: + x.remove(best_user) + cur_val = best_value + n = merge_p(n, graph[best_user], t, degrees, p) + else: + break + return cur_val + +@cython.nonecheck(False) +def fs_ps(tuple a): + cdef int t, k + cdef float o, best_value, best_diff, cur_val + cdef list x + cdef dict graph + cdef dict degrees + cdef dict p + t, k, x, graph, degrees, p = a + cdef list n + cur_val = 0 + n = [] # neighbors of greedy set + for i in range(1, k - t): + best_diff = 0 + best_user = None + best_value = 0 + for user in x: + o = merge_opt_ps(n, graph[user], t, degrees, p) + if o - cur_val > best_diff: + best_diff = o - cur_val + best_user = user + best_value = o + if best_user is not None: + x.remove(best_user) + cur_val = best_value + n = merge_ps(n, graph[best_user], t, degrees, p) + else: + break + return cur_val diff --git a/facebook_analysis/analyze.py b/facebook_analysis/analyze.py new file mode 100644 index 0000000..c5e6feb --- /dev/null +++ b/facebook_analysis/analyze.py @@ -0,0 +1,340 @@ +from bs4 import BeautifulSoup +import os.path as op +from client.tasks import normalize +from itertools import chain +from random import sample +from multiprocessing import Pool +from ads import fs, fs_p, fs_ps, fs_p_sample +import numpy as np +import pulp +import sys +from random import seed, betavariate, 
normalvariate +import matplotlib.pyplot as plt + +DATA_DIR = "../facebook_data" +DATASETS = ["hbo", "nyt", "lp", "google", "lmpt", "gp", "kiva", "coachella", + "peet", "gap"] +SYNTH_DIR = "../apgl" +SYNTH_DATASETS = ["b-a", "kk", "sw"] + + +def build_graph2(dataset): + users_file = op.join(DATA_DIR, dataset + "_users.txt") + seed_file = op.join(DATA_DIR, dataset + ".txt") + degrees = {} + graph = {} + with open(users_file) as f: + for line in f: + values = line.strip().split() + degrees[values[0]] = int(values[1]) + + soup = BeautifulSoup(open(seed_file)) + links = [div.a["href"] for div in soup.findAll("div", class_="fsl")] + for link in links: + basename, fname, getname = normalize(link) + long_name = op.join(DATA_DIR, "facebook", fname) + if not op.isfile(long_name): + continue + else: + with open(long_name) as f: + friends = [normalize(line.strip())[1] for line in f] + friends = [friend for friend in friends + if (friend in degrees) and degrees[friend] > 0] + if len(friends) > 0: + friends = list(set(friends)) + friends.sort(key=degrees.get, reverse=True) + graph[fname] = friends + degrees[fname] = len(friends) + print dataset, len(graph), len(list(sd_users(graph))) + return graph, degrees + + +def build_graph1(dataset): + fname = op.join(SYNTH_DIR, dataset + ".txt") + degrees = {} + graph = {} + with open(fname) as fh: + for line in fh: + values = line.strip().split("\t") + node = int(values[0]) + friends = zip(*[iter(values[1:])] * 2) + friends = [map(int, f) for f in friends] + for friend in friends: + degrees[friend[0]] = friend[1] + graph[node] = [friend[0] for friend in friends] + degrees[node] = len(graph[node]) + print fname, len(graph), len(list(sd_users(graph))) + return graph, degrees + + +def build_graph(dataset): + if dataset in DATASETS or dataset == "big": + return build_graph2(dataset) + else: + return build_graph1(dataset) + + +def print_graph(dataset): + graph, degrees = build_graph(dataset) + with open(dataset + "_single_graph.txt", "w") as f: + for user, friends in graph.iteritems(): + friends_deg = [(friend, str(degrees[friend])) + for friend in friends] + f.write(user + "\t" + + "\t".join("\t".join(friend) for friend in friends_deg) + + "\n") + + +def sd_users(graph): + return chain.from_iterable(graph.itervalues()) + + +def random(graph, degrees, n): + #n = int(len(graph) * ratio) + values = [] + for _ in xrange(100): + users = sample(graph, n) + values.append(sum(degrees[user] for user in users)) + return sum(values) / float(len(values)) + + +def random_friend(graph, degrees, n): + #n = int(len(graph) * ratio) + values = [] + for _ in xrange(100): + users = sample(graph, n / 2) + values.append(sum(degrees[sample(graph[user], 1)[0]] + degrees[user] + for user in users)) + return sum(values) / float(len(values)) + + +def im(graph, degrees, n): + #n = int(len(graph) * ratio) + l = list(graph.iterkeys()) + l.sort(key=lambda x: degrees[x], reverse=True) + return sum(degrees[user] for user in l[:n]) + + +def aps(graph, degrees, k, p=1, sampl=False): + x = list(set(graph.keys())) + #k = int(len(graph) * ratio) # budget + P = Pool(5) + if p == 1: + m = P.map(fs, zip(range(1, k - 1), + [k] * (k - 1), + [x] * (k - 1), + [graph] * (k - 1), + [degrees] * (k - 1))) + elif type(p) is dict: + m = P.map(fs_ps, zip(range(1, k - 1), + [k] * (k - 1), + [x] * (k - 1), + [graph] * (k - 1), + [degrees] * (k - 1), + [p] * (k - 1))) + elif sampl: + m = P.map(fs_p_sample, zip(range(1, k - 1), + [k] * (k - 1), + [x] * (k - 1), + [graph] * (k - 1), + [degrees] * (k - 1), + [p] * (k - 1))) 
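+ # Otherwise: a single scalar probability p < 1, evaluated without sampling by fs_p.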
+ else: + m = P.map(fs_p, zip(range(1, k - 1), + [k] * (k - 1), + [x] * (k - 1), + [graph] * (k - 1), + [degrees] * (k - 1), + [p] * (k - 1))) + + P.close() + try: + m = max(m) + except ValueError: + m = 0 + print m + return m + + +def generate_degree(degrees, p, distr="beta"): + #plt.figure() + seed() + ps = {} + if distr == "beta": + beta = 5. + alpha = 5. * p / (1 - p) + sample = lambda d: betavariate(alpha, beta) + elif distr == "gauss": + sample = lambda d: normalvariate(p, 0.01) + elif distr == "power": + alpha = 1. + beta = (1. - p) / p + sample = lambda d: betavariate(alpha, beta) + elif distr == "deg": + s = sum((1. / d) for d in degrees.itervalues() if d != 0) + c = len(list(d for d in degrees if d != 0)) * p / s + sample = lambda d: c / d if d != 0 else p + for node, deg in degrees.iteritems(): + s = sample(deg) + if s < 0.001: + ps[node] = 0. + elif s > 1.: + ps[node] = 1. + else: + ps[node] = s + #plt.hist(list(ps.itervalues()), 50) + #plt.savefig(distr + "_dist.pdf") + return ps + + +def compute_performance(dataset): + graph, degrees = build_graph(dataset) + a = [int(len(graph) * i) for i in np.arange(0, 1.1, 0.1)] + r = (a, + [im(graph, degrees, k) for k in a], + [random(graph, degrees, k) for k in a], + [random_friend(graph, degrees, k) for k in a], + [aps(graph, degrees, k) for k in a]) + with open(dataset + "_performance.txt", "w") as f: + f.write("\n".join("\t".join(map(str, k)) for k in zip(*r))) + + +def compute_performance_p(dataset, distr=None): + graph, degrees = build_graph(dataset) + ps = np.arange(0.01, 0.99, 0.1) + a = [int(len(graph) * i) for i in np.arange(0, 1.1, 0.1)] + if distr is None: + l = [[aps(graph, degrees, k, p) for k in a] + for p in ps] + else: + l = [[aps(graph, degrees, k, generate_degree(degrees, p, distr)) + for k in a] + for p in ps] + r = [a] + r += l + with open(dataset + "_performance_p_" + str(distr) + ".txt", "w") as f: + f.write("\n".join("\t".join(map(str, k)) for k in zip(*r))) + + +def lp(graph, degrees, k): + reverse = {} + for user in sd_users(graph): + reverse[user] = [] + for user in graph: + for friend in graph[user]: + reverse[friend].append(user) + + prob = pulp.LpProblem("ads", pulp.LpMaximize) + x = pulp.LpVariable.dicts("x", graph.keys(), 0., 1.) + y = pulp.LpVariable.dicts("y", sd_users(graph), 0., 1.) 
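+ # LP relaxation solved below:
+ #   max  sum_u deg(u)*x[u] + sum_v deg(v)*y[v]
+ #   s.t. y[v] <= sum of x[u] over users u whose friend list contains v
+ #        sum(x) + sum(y) <= k,  with 0 <= x, y <= 1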
+ prob += pulp.lpSum([degrees[user] * x[user] for user in graph] + + [degrees[user] * y[user] for user in sd_users(graph)]) + for user in sd_users(graph): + prob += pulp.lpSum([x[u] for u in reverse[user]] + [-y[user]]) >= 0 + prob += pulp.lpSum([x[u] for u in graph] + [y[u] for u in reverse]) <= k + prob.solve(pulp.COIN_CMD()) + print "Status:", pulp.LpStatus[prob.status] + print "Value =", pulp.value(prob.objective) + + +def lp_perf(): + graph, degrees = build_graph("peet") + a = [int(len(graph) * i) for i in np.arange(0, 1.1, 0.1)] + r = (a, + #[aps(graph, degrees, k) for k in a], + [lp(graph, degrees, k) for k in a]) + with open("lp_running_time.txt", "w") as f: + f.write("\n".join("\t".join(map(str, k)) for k in zip(*r))) + + +def lp_time(): + graph, degrees = build_graph("big") + sp = sample(graph.keys(), int(sys.argv[2])) + graph = {s: graph[s] for s in sp} + a = int(sys.argv[1]) + print len(list(sd_users(graph))), a + lp(graph, degrees, a) + + +def aps_time(): + graph, degrees = build_graph("big") + sp = sample(graph.keys(), int(sys.argv[2])) + graph = {s: graph[s] for s in sp} + a = int(sys.argv[1]) + print len(list(sd_users(graph))), a + aps(graph, degrees, a, p=0.9, sampl=True) + + +def lp_time_big(): + graph, degrees = build_graph("big") + graph_big = {} + for i in xrange(10): + for user in graph: + graph_big[user + str(i)] = graph[user] + degrees[user + str(i)] = degrees[user] + aps(graph_big, degrees, 500) + + +def hbo_likes(): + graph, degrees = build_graph("hbo") + like_file = op.join(DATA_DIR, "hbo_likes.txt") + likes = {} + for line in open(like_file): + values = line.strip().split("\t") + if "HBO" in values[1:]: + likes[values[0]] = True + a = [int(len(graph) * i) for i in np.arange(0, 1.1, 0.1)] + l = [aps(graph, degrees, k) for k in a] + for user in graph: + graph[user] = [friend for friend in graph[user] + if (friend in degrees and friend in likes)] + r = (a, + [im(graph, degrees, k) for k in a], + [aps(graph, degrees, k) for k in a], + l) + with open("hbo_likes_performance.txt", "w") as f: + f.write("\n".join("\t".join(map(str, k)) for k in zip(*r))) + + +def stats(): + for dataset in ["coachella"]: + graph, degrees = build_graph(dataset) + print dataset, len(graph) * 7, len(list(sd_users(graph))) * 7,\ + np.mean([degrees[u] for u in graph]),\ + np.mean([degrees[u] for u in sd_users(graph)]) + for dataset in ["nyt", "gap", "gp", "kiva"]: + graph, degrees = build_graph(dataset) + print dataset, len(graph) * 6, len(list(sd_users(graph))) * 6,\ + np.mean([degrees[u] for u in graph]),\ + np.mean([degrees[u] for u in sd_users(graph)]) + for dataset in ["google"]: + graph, degrees = build_graph(dataset) + print dataset, len(graph) * 5, len(list(sd_users(graph))) * 5,\ + np.mean([degrees[u] for u in graph]),\ + np.mean([degrees[u] for u in sd_users(graph)]) + for dataset in ["lp", "hbo", "lmpt"]: + graph, degrees = build_graph(dataset) + print dataset, len(graph) * 3, len(list(sd_users(graph))) * 3,\ + np.mean([degrees[u] for u in graph]),\ + np.mean([degrees[u] for u in sd_users(graph)]) + for dataset in ["peet"]: + graph, degrees = build_graph(dataset) + print dataset, len(graph) * 2, len(list(sd_users(graph))) * 2,\ + np.mean([degrees[u] for u in graph]),\ + np.mean([degrees[u] for u in sd_users(graph)]) + +if __name__ == "__main__": + #for dataset in SYNTH_DATASETS: + # compute_performance(dataset) + compute_performance_p("coachella", "deg") + #compute_performance("coachella") + #hbo_likes() + #lp_perf() + #lp_time() + #aps_time() + #stats() + #lp_time_big() + # _, 
degrees = build_graph2("coachella") + # with open("coachella_degrees.txt", "w") as fh: + # for deg in degrees.itervalues(): + # fh.write(str(deg) + "\n") diff --git a/facebook_analysis/seed.py b/facebook_analysis/seed.py new file mode 100644 index 0000000..7e2b851 --- /dev/null +++ b/facebook_analysis/seed.py @@ -0,0 +1,247 @@ +from analyze import sd_users, build_graph, DATASETS, SYNTH_DATASETS +import matplotlib.pyplot as plt +from matplotlib import rcParams, cm +from matplotlib.colors import Normalize +from matplotlib.pyplot import plot, legend, savefig, xlabel, ylabel,\ + hist, title, subplot, tight_layout, ticklabel_format, xlim, ylim +from mpl_toolkits.mplot3d import Axes3D +import numpy as np +import itertools + +mq = lambda x: x * 4 + + +def plot_degree_distributions(): + plt.figure(figsize=(7, 3)) + graph, degrees = build_graph("kiva") + fd_degrees = list(degrees[user] for user in graph) + sd_degrees = list(degrees[user] for user in sd_users(graph)) + n, bins, patches = plt.hist(fd_degrees, bins=50, cumulative=True, + label="Initial users", normed=True, + alpha=0.5, histtype="stepfilled") + n, bins, patches = plt.hist(sd_degrees, bins=50, cumulative=True, + histtype="stepfilled", normed=True, alpha=0.5, + label="Friends") + ylim(ymax=1.1) + plt.xlabel("Degree") + plt.ylabel("Probability") + plt.legend(loc="lower right") + plt.savefig("dist.pdf") + + +def plot_all_performances(): + plt.figure(figsize=(7, 14)) + for i, dataset in enumerate(DATASETS): + values = [map(float, line.strip().split("\t")) + for line in open(dataset + "_performance.txt")] + a, im, rd, rdf, aps = zip(*values) + a, im, rd, rdf, aps = [map(mq, l) for l in (a, im, rd, rdf, aps)] + a = np.arange(0, 1.001, 0.1) + ax = plt.subplot(5, 2, i + 1) + #ax.set_yscale("log") + plt.plot(a, im, label="Max deg.") + plt.plot(a, rd, label="Rand.") + plt.plot(a, rdf, label="Rand. friend") + plt.plot(a, aps, label="Adapt. Seeding") + plt.xlabel("Budget (fraction of the total number of users)") + plt.ylabel("Performance") + if dataset == "sw": + titl = "SmallWord" + if dataset == "coachella": + titl = "Conf. Model" + if dataset == "kk": + titl = "Kronecker" + if dataset == "b-a": + titl = "Barabasi-Albert" + plt.title(titl) + xlim(xmax=1.1) + plt.legend(loc="upper center", ncol=4, bbox_to_anchor=(0, 0, 1, 1.03), + bbox_transform=plt.gcf().transFigure) + plt.tight_layout() + plt.savefig("test2.pdf") + + +def compare_performance(fn): + plots = {} + plt.figure() + for dataset in DATASETS: + values = [map(float, line.strip().split("\t")) + for line in open(dataset + "_performance.txt")] + a, im, rd, rdf, aps = zip(*values) + plots[dataset] = [j * 1. / i for (j, i) in zip(aps, im)[1:]] + a = map(mq, a) + for dataset in DATASETS: + plt.plot(a[1:], plots[dataset], label=dataset) + xlim(xmax=550) + plt.xlabel("Budget") + plt.ylabel("Performance") + plt.legend(loc="lower right", ncol=2, fontsize="small") + plt.savefig(fn) + + +def compare_performance2(fn): + plots = {} + plt.figure() + for dataset in DATASETS: + values = [map(float, line.strip().split("\t")) + for line in open(dataset + "_performance.txt")] + a, im, rd, rdf, aps = zip(*values) + plots[dataset] = [j * 1. / i for (j, i) in zip(aps, im)[1:]] + a = map(mq, a) + a = map(int, a) + z = zip(*plots.itervalues()) + means = [np.mean(w) for w in z] + maxi = [np.max(w) for w in z] + mini = [np.min(w) for w in z] + ind = range(len(a[1:])) + width = 0.35 + plt.bar(ind, means, width, linewidth=0.1) + plt.errorbar([i + width / 2. 
for i in ind], means, [mini, maxi], elinewidth=1.2, fmt="none") + plt.xticks([i + width / 2. for i in ind], a[1:]) + plt.xlim(-width, len(ind) - 1 + 2 * width) + plt.xlabel("Budget") + plt.ylabel("Relative improvement") + plt.savefig(fn) + + +def compare_dist(): + fd, sd = [], [] + plt.figure(figsize=(5, 3)) + cm = iter(rcParams["axes.color_cycle"]) + for dataset in DATASETS: + graph, degrees = build_graph(dataset) + fd_degrees = list(degrees[user] for user in graph) + sd_degrees = list(degrees[user] for user in sd_users(graph)) + fd.append(np.mean(fd_degrees)) + sd.append(np.mean(sd_degrees)) + ind = range(len(DATASETS)) + width = 0.35 + plt.bar(ind, fd, width, label="Initial users", color=next(cm)) + plt.bar([i + width for i in ind], sd, width, label="Friends", + color=next(cm)) + plt.xlim(-width, len(ind) - 1 + 3 * width) + plt.xticks([i + width for i in ind], DATASETS) + plt.ylabel("Avg. degree") + plt.legend() + plt.savefig("para.pdf") + + +def plot_perf_prob(): + plt.figure() + with open("peet_performance_p.txt") as f: + values = [map(float, line.strip().split("\t")) for line in f] + values = zip(*values) + a = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] + for i in [0, 1, 2, 3, 5, 9]: + plt.plot(values[0], values[i + 1], label="$p = " + str(a[i]) + "$") + plt.legend() + with open("peet_performance.txt") as f: + values = [map(float, line.strip().split("\t")) for line in f] + values = zip(*values) + plt.gca().set_yscale("log") + plt.xlabel("Budget") + plt.ylabel("Performance") + plt.plot(values[0], values[1], label="Max. degree") + plt.legend(loc="lower right", fontsize="small", ncol=2) + xlim(xmax=450) + plt.savefig("prob.pdf") + + +def plot_hbo_likes(): + plt.figure() + rcParams["font.size"] = 6 + with open("hbo_likes_performance.txt") as f: + values = [map(float, line.strip().split("\t")) for line in f] + a, im, aps, apso = zip(*values) + a = np.arange(0, 1.001, 0.1) + plt.gca().set_yscale("log") + #plt.ticklabel_format(style='sci', axis='y', scilimits=(0, 0)) + plt.plot(a, map(mq, im), label="Max. degr.") + plt.plot(a, map(mq, aps), label="Adapt. seed. (rest.)") + plt.plot(a, map(mq, apso), label="Adapt. 
seed.") + plt.xlabel("Budget") + plt.ylabel("Performance") + xlim(xmax=1.1) + plt.legend(loc="lower right") + plt.savefig("hbo_likes.pdf") + + +def plot_3d(): + for dist in ["beta", "gauss"]: + fig = plt.figure() + with open("coachella_performance_p_" + dist + ".txt") as f: + values = [map(float, line.strip().split("\t")) for line in f] + k = np.arange(0, 1.001, 0.1) + ps = np.arange(0.01, 0.99, 0.1) + x, y = np.meshgrid(k, ps) + perfs = [value[1:] for value in values] + perfs = zip(*perfs) + ax = fig.add_subplot(111, projection='3d') + ax.plot_wireframe(x, y, perfs, linewidth=0.1) + ticklabel_format(style='sci', axis='z', scilimits=(0, 0)) + xlabel("Budget (fraction of nodes)") + ylabel("Distribution mean") + ax.set_zlabel("Performance") + ax.invert_xaxis() + plt.savefig(dist + ".pdf") + plt.show() + + +def plot_time(): + plt.figure() + rcParams["font.size"] = 6 + a1 = np.loadtxt("time_aps_100.txt") + a2 = np.loadtxt("time_aps_500.txt") + lp1 = np.loadtxt("time_lp_100.txt") + lp2 = np.loadtxt("time_lp_500.txt") + subplot(2, 2, 1) + plot(a1[:, 0], a1[:, 1], "-", label="Comb.") + plot(lp1[:, 0], lp1[:, 1], "-", label="LP") + xlabel("n") + ylabel("time (s)") + xlim(0, 100000) + legend(loc="upper left") + ticklabel_format(style='sci', axis='x', scilimits=(0, 0)) + subplot(2, 2, 2) + plot(a1[:, 0], a1[:, 2], "-", label="Comb.") + plot(lp1[:, 0], lp1[:, 2], "-", label="LP") + ticklabel_format(style='sci', axis='x', scilimits=(0, 0)) + xlabel("n") + ylabel("\# cycles") + xlim(0, 100000) + legend(loc="upper left") + subplot(2, 2, 3) + plot(a2[:, 0], a2[:, 1], "-", label="Comb.") + plot(lp2[:, 0], lp2[:, 1], "-", label="LP") + ticklabel_format(style='sci', axis='x', scilimits=(0, 0)) + xlabel("n") + ylabel("time (s)") + xlim(0, 100000) + legend(loc="upper left") + subplot(2, 2, 4) + plot(a2[:, 0], a2[:, 2], "-", label="Comb.") + plot(lp2[:, 0], lp2[:, 2], "-", label="LP") + ticklabel_format(style='sci', axis='x', scilimits=(0, 0)) + xlabel("n") + ylabel("\# cycles") + xlim(0, 100000) + legend(loc="upper left") + tight_layout(h_pad=-0.5) + savefig("time.pdf") + + +if __name__ == "__main__": + SYNTH_DATASETS = ["b-a", "kk", "sw", "coachella"] + DATASETS = SYNTH_DATASETS + plot_all_performances() + #plot_3d() + #plot_hbo_likes() + #compare_performance() + #plot_perf_prob() + #compare_dist() + #plot_time() + #plot_degree_distributions() + #for style in plt.style.available: + # plt.style.use(style) + # compare_performance("performance_" + style + ".pdf") + #compare_performance2("comp4_" + ".pdf") diff --git a/facebook_analysis/setup.py b/facebook_analysis/setup.py new file mode 100644 index 0000000..043e734 --- /dev/null +++ b/facebook_analysis/setup.py @@ -0,0 +1,4 @@ +from distutils.core import setup +from Cython.Build import cythonize + +setup(ext_modules=cythonize("ads.pyx")) diff --git a/facebook_scraping/Makefile b/facebook_scraping/Makefile new file mode 100644 index 0000000..fced427 --- /dev/null +++ b/facebook_scraping/Makefile @@ -0,0 +1,49 @@ +SHELL=/bin/bash +HOSTS=servers.txt +USER=ubuntu +OPTIONS=-x -"F ./ssh_config" +FOPTIONS=$(OPTIONS) -h <(cut -f1 $(HOSTS)) + +.PHONY: deploy servers + +servers_simple: + ec2-describe-instances --region us-west-2 | grep running | cut -f4,17,18 > servers.txt + +servers: + ec2-describe-instances --region us-west-2 | grep running | cut -f4,17,18 > servers.txt + paste <(cut -f2 servers.txt) <(cut -f28,29 survey8a.txt) > credentials.txt + rsync credentials.txt horel.org:kdd/ + +servers2: + ec2-describe-instances --region us-west-2 | grep running | cut 
-f4,17,18 > servers.txt + paste <(cut -f2 servers.txt) fb_accounts2.txt > credentials.txt + rsync credentials.txt horel.org:kdd/ + +uptime: + pssh $(FOPTIONS) 'uptime' + +running: + pssh -i $(FOPTIONS) 'pgrep -f "celery worker"' + +deploy: + cd client; tar -czf facebook.tar.gz requirements.txt tasks.py + cd client; rsync facebook.tar.gz Makefile horel.org:public_html/facebook + pssh -i $(FOPTIONS) 'rm -rf tasks.py tasks.pyc kdd/; curl http://thibaut.horel.org/facebook/Makefile > Makefile; make boostrap' + +run: + pssh -i $(FOPTIONS) 'make run' + +stop: + pssh -i $(FOPTIONS) "make stop; killall chromedriver; killall chromium-browser; killall Xvfb; rm -f tasks.pyc" + +restart: + pssh $(FOPTIONS) "make restart" + +test: + pssh -i $(FOPTIONS) 'rm -f tasks.pyc; grep "replace" tasks.py' + +deploy_server: + rsync run.py run2.py server.py credentials.txt horel.org:kdd/ + + + diff --git a/facebook_scraping/client/Makefile b/facebook_scraping/client/Makefile new file mode 100644 index 0000000..3a07802 --- /dev/null +++ b/facebook_scraping/client/Makefile @@ -0,0 +1,15 @@ +all: boostrap run + +boostrap: + curl http://thibaut.horel.org/facebook/facebook.tar.gz > facebook.tar.gz + tar -xzf facebook.tar.gz + +run: + celery -A tasks --concurrency=2 worker --detach -l info + +stop: + rm -f celeryd.pid + pgrep -f "celery worker" | xargs kill -9 + +restart: + pgrep -f "celery worker" | xargs kill -HUP diff --git a/facebook_scraping/client/__init__.py b/facebook_scraping/client/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/facebook_scraping/client/requirements.txt b/facebook_scraping/client/requirements.txt new file mode 100644 index 0000000..cba9c1f --- /dev/null +++ b/facebook_scraping/client/requirements.txt @@ -0,0 +1,4 @@ +beautifulsoup4 +celery +selenium +xvfbwrapper diff --git a/facebook_scraping/client/tasks.py b/facebook_scraping/client/tasks.py new file mode 100644 index 0000000..4557968 --- /dev/null +++ b/facebook_scraping/client/tasks.py @@ -0,0 +1,243 @@ +from xvfbwrapper import Xvfb +from selenium import webdriver +from selenium.common.exceptions import ElementNotVisibleException,\ + NoSuchElementException, StaleElementReferenceException, WebDriverException +from time import sleep +from bs4 import BeautifulSoup, NavigableString +from celery import Celery, Task +from urllib2 import urlopen +import socket + +app = Celery('tasks', broker='amqp://guest@horel.org//') +app.conf.CELERY_RESULT_BACKEND = 'rpc' +app.conf.CELERY_ENABLE_UTC = True +app.conf.CELERY_ACKS_LATE = True +drivers = [None] +ip = socket.gethostbyname(socket.gethostname()) + + +def strip(url): + if url.endswith("/friends"): + return url[:-8] + else: + return url.split("&")[0] + + +def normalize(url): + if "profile.php" in url: + basename = url.split("&")[0] + fname = basename.split("=")[-1] + getname = basename + "&sk=friends" + else: + basename = url.split("?")[0] + fname = basename.split("/")[-1] + getname = basename + "/friends" + return basename, fname, getname + + +class ListFollowers(Task): + + @property + def driver(self): + if drivers[0] is None: + uname, passwd = urlopen("http://horel.org:8080/").readline().strip().split() + vdisplay = Xvfb() + vdisplay.start() + driver = webdriver.Chrome() + driver.get("https://facebook.com") + driver.find_element_by_id("email").send_keys(uname) + elem = driver.find_element_by_id("pass") + elem.send_keys(passwd) + elem.submit() + drivers[0] = driver + return drivers[0] + + def run(self, url): + try: + self.driver.get(url) + except WebDriverException: + return 
{"friends": [], "for": url, "orig": ip} + + while True: + for _ in xrange(5): + try: + footer = self.driver.find_element_by_class_name("_359") + except (NoSuchElementException, ElementNotVisibleException): + sleep(0.1) + else: + break + else: + break + + try: + self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") + footer.click() + except StaleElementReferenceException: + sleep(0.1) + except WebDriverException: + for _ in xrange(5): + try: + footer.click() + except (WebDriverException, StaleElementReferenceException): + sleep(0.1) + else: + break + else: + break + + for _ in xrange(5): + try: + div = self.driver.find_element_by_class_name("_30f") + except NoSuchElementException: + sleep(0.1) + else: + break + else: + try: + self.driver.find_element_by_id("loginbutton") + except NoSuchElementException: + return {"friends": [], "for": url, "orig": ip} + else: + return {"friends": None, "for": url, "orig": ip} + + soup = BeautifulSoup(div.get_attribute("outerHTML")) + return {"friends": [li.a["href"] + for li in soup.findAll("li", class_="_698")], + "for": url, + "orig": ip} + + +class NumFollowers(Task): + + @property + def driver(self): + if drivers[0] is None: + uname, passwd = urlopen("http://horel.org:8080/").readline().strip().split() + vdisplay = Xvfb() + vdisplay.start() + driver = webdriver.Chrome() + driver.get("https://facebook.com") + driver.find_element_by_id("email").send_keys(uname) + elem = driver.find_element_by_id("pass") + elem.send_keys(passwd) + elem.submit() + drivers[0] = driver + return drivers[0] + + def run(self, url): + try: + self.driver.get(url) + except WebDriverException: + return {"nfriends": 0, "for": url, "orig": ip} + + for i in xrange(20): + try: + box = self.driver.find_element_by_class_name("_1f8g") + except (NoSuchElementException, ElementNotVisibleException): + sleep(0.1) + else: + break + else: + try: + self.driver.find_element_by_id("loginbutton") + except NoSuchElementException: + return {"nfriends": 0, "for": url, "orig": ip} + else: + return {"nfriends": None, "for": url, "orig": ip} + + soup = BeautifulSoup(box.get_attribute("outerHTML")) + a = soup.find("a", class_="uiLinkSubtle") + try: + n_friends = int(a.string.replace(",", "").replace(".", "").replace(" ", "").encode("ascii", "ignore")) + except ValueError: + n_friends = a.string + print n_friends + return {"nfriends": n_friends, + "for": url, + "orig": ip} + + +class Likes(Task): + + @property + def driver(self): + if drivers[0] is None: + uname, passwd = urlopen("http://horel.org:8080/").readline().strip().split() + vdisplay = Xvfb() + vdisplay.start() + driver = webdriver.Chrome() + driver.get("https://facebook.com") + driver.find_element_by_id("email").send_keys(uname) + elem = driver.find_element_by_id("pass") + elem.send_keys(passwd) + elem.submit() + drivers[0] = driver + return drivers[0] + + def run(self, url): + try: + self.driver.get(url) + except WebDriverException: + return {"likes": [], "for": url, "orig": ip} + + while True: + for _ in xrange(5): + try: + footer = self.driver.find_element_by_class_name("_359") + except (NoSuchElementException, ElementNotVisibleException): + sleep(0.1) + else: + break + else: + break + + try: + self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") + footer.click() + except StaleElementReferenceException: + sleep(0.1) + except WebDriverException: + for _ in xrange(5): + try: + footer.click() + except (WebDriverException, StaleElementReferenceException): + sleep(0.1) + else: + break + else: + break 
+ + for _ in xrange(5): + try: + div = self.driver.find_element_by_class_name("_30f") + except NoSuchElementException: + sleep(0.1) + else: + break + else: + try: + self.driver.find_element_by_id("loginbutton") + except NoSuchElementException: + return {"likes": "", "for": url, "orig": ip} + else: + return {"likes": None, "for": url, "orig": ip} + + def clean(a): + for child in a.children: + if type(child) == NavigableString: + return child + else: + return "" + return "" + + soup = BeautifulSoup(div.get_attribute("outerHTML")) + likes = [clean(li.find("a", class_="_gx7")) + for li in soup.findAll("li", class_="_5rz")] + return {"likes": u"\t".join(likes).encode("utf8"), + "for": url, + "orig": ip} + +if __name__ == "__main__": + nf = Likes() + with open("toto.txt", "w") as f: + f.write( u"\t".join(nf.run("https://www.facebook.com/grvgaba29" + "/video_tv_show_favorite")["likes"]).encode("utf8") + "\n") diff --git a/facebook_scraping/limits.py b/facebook_scraping/limits.py new file mode 100644 index 0000000..8ce38cf --- /dev/null +++ b/facebook_scraping/limits.py @@ -0,0 +1,6 @@ +from celery import Celery +app = Celery('tasks', broker='amqp://guest@horel.org//') +print app.control.rate_limit('tasks.NumFollowers', '4/m', destination=['celery@ip-172-31-42-158'], reply=True) +print app.control.rate_limit('tasks.ListFollowers', '4/m', destination=['celery@ip-172-31-42-158'], reply=True) +print app.control.rate_limit('tasks.NumFollowers', '1/h', destination=['celery@ip-172-31-42-159'], reply=True) +print app.control.rate_limit('tasks.ListFollowers', '1/h', destination=['celery@ip-172-31-42-159'], reply=True) diff --git a/facebook_scraping/mturk.py b/facebook_scraping/mturk.py new file mode 100644 index 0000000..f6322da --- /dev/null +++ b/facebook_scraping/mturk.py @@ -0,0 +1,16 @@ +import csv +import os.path as op +from glob import glob + +for fname in glob("*.csv"): + with open(fname) as f: + reader = csv.reader(f) + oname, _ = op.splitext(fname) + oname = oname + ".txt" + with open(oname, "w") as of: + for i, row in enumerate(reader): + if i == 0: + continue + if row[-1] == "": + row = row[:-1] + of.write("\t".join(row) + "\n") diff --git a/facebook_scraping/run.py b/facebook_scraping/run.py new file mode 100644 index 0000000..94eb1a4 --- /dev/null +++ b/facebook_scraping/run.py @@ -0,0 +1,91 @@ +from tasks import NumFollowers, ListFollowers, normalize, strip +from bs4 import BeautifulSoup +from celery.result import ResultSet +import os.path as op +from datetime import datetime +import sys + +nf = NumFollowers() +lf = ListFollowers() + +users = {} +try: + with open(sys.argv[1]) as f: + for line in f: + values = line.strip().split() + users[values[0]] = int(values[1].replace(",", "").replace(".", "").replace(" ", "").encode("ascii", "ignore")) +except IOError: + pass + +output = open(sys.argv[1], "a") +bad = open("bad.txt", "a") + + +def add_user(user, degree): + users[user] = degree + output.write(user + " " + str(degree) + "\n") + + +def call_back(tid, value): + print datetime.now().isoformat() + " " + str(value) + if "nfriends" in value: + if value["nfriends"] is None: + bad.write(value["orig"] + "\n") + bad.flush() + return + basename, fname, getname = normalize(value["for"]) + n_friends = int(str(value["nfriends"]).replace(",", "").replace(".", "").replace(" ", "").encode("ascii", "ignore")) + add_user(fname, n_friends) + return + +if sys.argv[4] == "True": + todo = ResultSet([]) + soup = BeautifulSoup(open(sys.argv[2])) + links = [div.a["href"] for div in soup.findAll("div", 
class_="fsl")] + chunk = [] + for link in links: + basename, finame, getname = normalize(link) + if op.isfile("facebook/" + finame): + with open("facebook/" + finame) as f: + for line in f: + basename, fname, getname = normalize(line.strip()) + if fname not in users: + print finame + todo.add(nf.delay(basename)) + todo.join_native(callback=call_back) +todo = [] + + +def call_back_fd(tid, value): + print datetime.now().isoformat() + " " + str(value) + if value["friends"] is None: + bad.write(value["orig"] + "\n") + bad.flush() + return + basename, fname, getname = normalize(strip(value["for"])) + add_user(fname, len(value["friends"])) + with open("facebook/" + fname, "w") as f: + for friend in value["friends"]: + basename, fname, getname = normalize(friend) + f.write(basename + "\n") + if fname not in users: + todo.append(basename) + +soup = BeautifulSoup(open(sys.argv[2])) +links = [div.a["href"] for div in soup.findAll("div", class_="fsl")] +chunk = [] +for link in links: + basename, fname, getname = normalize(link) + if not op.isfile("facebook/" + fname): + chunk.append(getname) + if len(chunk) == int(sys.argv[3]): + todofd = ResultSet([]) + for name in chunk: + todofd.add(lf.delay(name)) + chunk = [] + todofd.join_native(callback=call_back_fd) + todos = ResultSet([]) + for name in todo: + todos.add(nf.delay(name)) + todo = [] + todos.join_native(callback=call_back) diff --git a/facebook_scraping/run2.py b/facebook_scraping/run2.py new file mode 100644 index 0000000..a52a37b --- /dev/null +++ b/facebook_scraping/run2.py @@ -0,0 +1,90 @@ +from tasks import NumFollowers, ListFollowers, normalize, Likes +from bs4 import BeautifulSoup +from celery.result import ResultSet +import os.path as op +from datetime import datetime +import sys + +nf = NumFollowers() +lf = ListFollowers() +likes = Likes() + +users = {} +try: + with open(sys.argv[1]) as f: + for line in f: + values = line.strip().split() + users[values[0]] = int(values[1].replace(",", "").replace(".", "").replace(" ", "").encode("ascii", "ignore")) +except IOError: + pass + +users_likes = {} +try: + with open(sys.argv[3]) as f: + for line in f: + values = line.strip().split() + users_likes[values[0]] = True +except IOError: + pass + +output = open(sys.argv[3], "a") +bad = open("bad.txt", "a") + + +def add_user(user, degree): + users[user] = degree + output.write(user + " " + str(degree) + "\n") + + +def add_user2(user, likes): + output.write(user + "\t" + likes + "\n") + + +def strip2(url): + l = "/video_tv_show_favorite" + if url.endswith(l): + return url[:-len(l)] + else: + return url.split("&")[0] + + +def call_back(tid, value): + print datetime.now().isoformat() + " " + str(value) + if "likes" in value: + if value["likes"] is None: + bad.write(value["orig"] + "\n") + bad.flush() + return + basename, fname, getname = normalize(strip2(value["for"])) + add_user2(fname, value["likes"]) + return + + +def normalize2(url): + if "profile.php" in url: + basename = url.split("&")[0] + fname = basename.split("=")[-1] + getname = basename + "&sk=video_tv_show_favorite" + else: + basename = url.split("?")[0] + fname = basename.split("/")[-1] + getname = basename + "/video_tv_show_favorite" + return basename, fname, getname + +soup = BeautifulSoup(open(sys.argv[2])) +links = [div.a["href"] for div in soup.findAll("div", class_="fsl")] +chunk = [] +for link in links: + basename, finame, getname = normalize(link) + if op.isfile("facebook/" + finame): + with open("facebook/" + finame) as f: + for line in f: + basename, fname, getname = 
normalize2(line.strip()) + if fname in users and users[fname] > 0 and fname not in users_likes: + chunk.append(getname) + if len(chunk) == 100: + todo = ResultSet([]) + for name in chunk: + todo.add(likes.delay(name)) + chunk = [] + todo.join_native(callback=call_back) diff --git a/facebook_scraping/seed.py b/facebook_scraping/seed.py new file mode 100644 index 0000000..932c16b --- /dev/null +++ b/facebook_scraping/seed.py @@ -0,0 +1,7 @@ +import sys +from bs4 import BeautifulSoup + +soup = BeautifulSoup(open(sys.argv[1])) +links = [div.a["href"] for div in soup.findAll("div", class_="fsl")] +for link in links: + print link diff --git a/facebook_scraping/server.py b/facebook_scraping/server.py new file mode 100644 index 0000000..6425c7b --- /dev/null +++ b/facebook_scraping/server.py @@ -0,0 +1,16 @@ +from bottle import route, run, request + + +@route('/') +def index(): + d = {} + with open("credentials.txt") as f: + for line in f: + values = line.strip().split() + d[values[0]] = values[1:3] + + ip = request.environ.get('REMOTE_ADDR') + return " ".join(d[ip]) + + +run(host='0.0.0.0', port=8080) diff --git a/fb_accounts.txt b/fb_accounts.txt deleted file mode 100644 index fbd093a..0000000 --- a/fb_accounts.txt +++ /dev/null @@ -1,2 +0,0 @@ -127.0.0.1 thibaut.horel@normalesup.org Dlmatc06 -thibaut.horel+1@gmail.com Dlmatc06 diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index cba9c1f..0000000 --- a/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -beautifulsoup4 -celery -selenium -xvfbwrapper diff --git a/run.py b/run.py deleted file mode 100644 index ffee06a..0000000 --- a/run.py +++ /dev/null @@ -1,73 +0,0 @@ -from tasks import NumFollowers, ListFollowers, normalize -from bs4 import BeautifulSoup -from celery.result import ResultSet -import os.path as op -from glob import glob - -nf = NumFollowers() -lf = ListFollowers() -rset = ResultSet([]) - -users = {} -try: - with open("all_users.txt") as f: - for line in f: - values = line.strip().split() - users[values[0]] = int(values[1]) -except IOError: - pass - -output = open("all_users.txt", "a") - - -def strip(url): - if url.endswith("/friends"): - return url[:-8] - else: - return url.split("&")[0] - - -def add_user(user, degree): - print user, degree - users[user] = degree - output.write(user + " " + str(degree) + "\n") - output.flush() - - -def call_back(tid, value): - if "friends" in value: - return - - if "nfriends" in value: - basename, fname, getname = normalize(value["for"]) - add_user(fname, value["nfriends"]) - return - -todo = ResultSet([]) -for finame in glob("facebook/*"): - with open(finame) as f: - for line in f: - basename, fname, getname = normalize(line.strip()) - if fname not in users: - print finame - todo.add(nf.delay(basename)) -todo.join_native(callback=call_back) - -soup = BeautifulSoup(open("seed.txt")) -links = [div.a["href"] for div in soup.findAll("div", class_="fsl")] -for link in links[:100]: - basename, fname, getname = normalize(link) - if not op.isfile("facebook/" + fname): - result = lf.delay(getname) - value = result.get() - basename, fname, getname = normalize(strip(value["for"])) - add_user(fname, len(value["friends"])) - todo = ResultSet([]) - with open("facebook/" + fname, "w") as f: - for friend in value["friends"]: - basename, fname, getname = normalize(friend) - f.write(basename + "\n") - if fname not in users: - todo.add(nf.delay(basename)) - print ("facebook/" + fname) - todo.join_native(callback=call_back) diff --git a/seed.txt b/seed.txt deleted file mode 100644 index 
6fa0d6a..0000000 --- a/seed.txt +++ /dev/null @@ -1 +0,0 @@ -
diff --git a/server.py b/server.py deleted file mode 100644 index c002256..0000000 --- a/server.py +++ /dev/null @@ -1,16 +0,0 @@ -from bottle import route, run, request - - -@route('/') -def index(): - d = {} - with open("fb_accounts.txt") as f: - for line in f: - values = line.strip().split() - d[values[0]] = values[1:] - - ip = request.environ.get('REMOTE_ADDR') - return " ".join(d[ip]) - - -run(host='0.0.0.0', port=8080) diff --git a/tasks.py b/tasks.py deleted file mode 100644 index cb0c3aa..0000000 --- a/tasks.py +++ /dev/null @@ -1,111 +0,0 @@ -from xvfbwrapper import Xvfb -from selenium import webdriver -from selenium.common.exceptions import ElementNotVisibleException,\ - NoSuchElementException, StaleElementReferenceException -from time import sleep -from bs4 import BeautifulSoup -from celery import Celery, Task -from urllib2 import urlopen - -app = Celery('tasks', broker='amqp://guest@horel.org//') -app.conf.CELERY_RESULT_BACKEND = 'rpc' -app.conf.CELERY_ENABLE_UTC = True -drivers = [None] - - -def normalize(url): - if "profile.php" in url: - basename = url.split("&")[0] - fname = basename.split("=")[-1] - getname = basename + "&sk=friends" - else: - basename = url.split("?")[0] - fname = basename.split("/")[-1] - getname = basename + "/friends" - return basename, fname, getname - - -class ListFollowers(Task): - - @property - def driver(self): - if drivers[0] is None: - uname, passwd = urlopen("http://horel.org:8080/").readline().strip().split() - vdisplay = Xvfb() - vdisplay.start() - driver = webdriver.Chrome() - driver.get("https://facebook.com") - driver.find_element_by_id("email").send_keys(uname) - elem = driver.find_element_by_id("pass") - elem.send_keys(passwd) - elem.submit() - drivers[0] = driver - return drivers[0] - - def run(self, url): - self.driver.get(url) - while True: - for i in xrange(5): - try: - footer = self.driver.find_element_by_class_name("_359") - except (NoSuchElementException, ElementNotVisibleException): - sleep(0.1) - else: - break - else: - break - - try: - footer.click() - except StaleElementReferenceException: - sleep(0.1) - - for i in xrange(5): - try: - div = self.driver.find_element_by_class_name("_30f") - except NoSuchElementException: - sleep(0.1) - else: - break - else: - return {"friends": [], "for": url} - - soup = BeautifulSoup(div.get_attribute("outerHTML")) - return {"friends": [li.a["href"] - for li in soup.findAll("li", class_="_698")], - "for": url} - - -class NumFollowers(Task): - - @property - def driver(self): - if drivers[0] is None: - uname, passwd = urlopen("http://horel.org:8080/").readline().strip().split() - vdisplay = Xvfb() - vdisplay.start() - driver = webdriver.Chrome() - driver.get("https://facebook.com") - driver.find_element_by_id("email").send_keys(uname) - elem = driver.find_element_by_id("pass") - elem.send_keys(passwd) - elem.submit() - drivers[0] = driver - return drivers[0] - - def run(self, url): - self.driver.get(url) - for i in xrange(5): - try: - box = self.driver.find_element_by_class_name("_1f8g") - except (NoSuchElementException, ElementNotVisibleException): - sleep(0.1) - else: - break - else: - return {"nfriends": 0, "for": url} - - soup = BeautifulSoup(box.get_attribute("outerHTML")) - a = soup.find("a", class_="uiLinkSubtle") - return {"nfriends": int(a.string.replace(",", "")), - "for": url} diff --git a/twitter/api_accounts.txt b/twitter/api_accounts.txt index 836b10d..cd0dea6 100644 --- a/twitter/api_accounts.txt +++ b/twitter/api_accounts.txt @@ -1,4 +1,3 @@ -thibaut.horel@gmail.com Dlmatc06 
GT3ILinlqcuChZY2ueOb1Q 9Jx9WGyfNea35X2kYCAN8hh9WkZl6wD7b4yXkY 2291723059-dvaHVGA50FYgDtxxZZQoBU0MQYysdaYOFIyOeLa 70GdBOKCIQWliX1hllfgmek2vEvrnKBqm0bBfApbP38TO zaran.krleza+1@gmail.com i6rkXWj78 Fle9xRwFyXO3SV7zR7KDg 0rAzjUo6yyx0DtHR6EvIQPenynJKmLKgPvyGRqj4w 2304251221-ztXyr6HFBOuDbPiWqFQT3wWAQfW6iEw7RoQXrwW 6xf5T89H4wneiiSskuRtL8GWHhK0g84CNmPdCeAOiXCP8 zaran.krleza+6@gmail.com och9phoM6qu HIIXtDoVIbc54IFoMzRmAQ E57OPRvxIOH5CS2ROSBMs0jS0UY5lCMsxKEk1mBws 2315047123-0skfirkKYl78eo66TFc3g6pkqzuVWZLGYIQRLny m7kyeesr726sSyF8UTQCFYssphbhqPeVftbmC67uwvrrf zaran.krleza+7@gmail.com ohr8ID7xoo DhjatHIduiUWDfwCPy13Ig 9QYIrGugvMXeMSqe67t7ylIPC8XXfDlvRAM2mwB6Rs 2315047440-RSva8oO8Mz0KL4npovzOCsg3WEbY7JWgbXR5BeJ Oy8iIhQrsVH9D1eQ97sQPlTrExcKDtarLQEqpcXDO1fMl diff --git a/twitter/dispatcher.py b/twitter/dispatcher.py index 56fb9f7..2bba1c3 100644 --- a/twitter/dispatcher.py +++ b/twitter/dispatcher.py @@ -51,8 +51,6 @@ class Dispatcher: def add_user(self, user_id, user_name, followers_count): self.users[user_id] = user_name - if int(followers_count) >= 5000: - return if (not pa.isfile(pa.join("data", "users", user_id + ".txt")) and user_id not in self.current_followers): self.followers_queue[user_id] = (user_name, followers_count) diff --git a/twitter/scraper.py b/twitter/scraper.py index 49b116a..e912782 100644 --- a/twitter/scraper.py +++ b/twitter/scraper.py @@ -92,5 +92,4 @@ class Driver: if __name__ == "__main__": credentials = open("scraping_accounts.txt").readline().strip().split() driver = Driver(*credentials[:2]) - # driver.get_followers("23302126", "flipper509") - print driver.get_profile(100, "thibauthorel") + driver.get_followers("23302126", "flipper509") diff --git a/twitter/stream.py b/twitter/stream.py index 71cf615..4fe38c4 100644 --- a/twitter/stream.py +++ b/twitter/stream.py @@ -24,9 +24,7 @@ class Listener(StreamListener): def get_concepts(self, entities): hashtags = (hashtag["text"].lower() for hashtag in entities["hashtags"]) - users = (user["screen_name"].lower() - for user in entities["user_mentions"]) - return set(chain(hashtags, users)) + return set(hashtags) def on_status(self, tweet): concepts = self.get_concepts(tweet.entities) @@ -35,6 +33,7 @@ class Listener(StreamListener): str(tweet.user.friends_count), str(tweet.user.verified), tweet.created_at.isoformat()]) + print str(dict(tweet)) for concept in concepts: if concept in self.fhandlers: fh = self.fhandlers[concept] @@ -47,7 +46,7 @@ def process(filename, cred_file): concepts = [line.strip() for line in f] credentials = open(cred_file).readline().strip().split() os.chdir("data") - entities = [("#" + concept, "@" + concept) for concept in concepts] + entities = [("#" + concept) for concept in concepts] track = chain.from_iterable(entities) auth = OAuthHandler(*credentials[2:4]) auth.set_access_token(*credentials[4:]) -- cgit v1.2.3-70-g09d2