"""Crawl friend lists and follower counts through Celery tasks.

Usage: <script> USERS_FILE LINKS_HTML CHUNK_SIZE REFRESH

* USERS_FILE -- one "<name> <count>" pair per line; read at start-up when it
  exists, then appended to as counts arrive.
* LINKS_HTML -- HTML file; the ``href`` of the first ``<a>`` inside every
  ``<div class="fsl">`` is a profile link to process.
* CHUNK_SIZE -- number of ``ListFollowers`` tasks dispatched per batch.
* REFRESH    -- the literal string ``True`` to first request counts for
  friends already stored under ``facebook/`` but missing from USERS_FILE.

Profiles whose task returned no data are appended to ``bad.txt``.
"""
from tasks import NumFollowers, ListFollowers, normalize, strip
from bs4 import BeautifulSoup
from celery.result import ResultSet
import os.path as op
from datetime import datetime
import sys

nf = NumFollowers()
lf = ListFollowers()

# Maps a normalized profile file name to its known friend/follower count.
users = {}


def _parse_count(raw):
    """Convert a displayed count such as ``"1,234"`` or ``"1.234"`` to an int.

    Commas, dots and spaces are removed before conversion; encoding to ASCII
    with ``ignore`` drops any remaining non-ASCII characters.

    Raises:
        ValueError: if what is left is not a valid integer literal.
    """
    cleaned = str(raw).replace(",", "").replace(".", "").replace(" ", "")
    return int(cleaned.encode("ascii", "ignore"))


def _read_links(path):
    """Return the ``href`` of every ``div.fsl`` anchor in the HTML at *path*."""
    # The document is parsed while the handle is open; the handle is then
    # closed instead of being leaked.
    with open(path) as handle:
        soup = BeautifulSoup(handle)
    return [div.a["href"] for div in soup.findAll("div", class_="fsl")]


# A missing users file simply means this is the first run, hence the
# deliberately ignored IOError.
try:
    with open(sys.argv[1]) as f:
        for line in f:
            values = line.split()
            if len(values) < 2:
                # Blank or truncated line (e.g. left by an interrupted run).
                continue
            users[values[0]] = _parse_count(values[1])
except IOError:
    pass

output = open(sys.argv[1], "a")
bad = open("bad.txt", "a")

# Names of friends discovered by call_back_fd that still need a count.
todo = []


def add_user(user, degree):
    """Record *degree* for *user* in ``users`` and append it to the users file."""
    users[user] = degree
    output.write(user + " " + str(degree) + "\n")
    # Flushed like ``bad`` so an interrupted crawl keeps what it already fetched.
    output.flush()


def call_back(tid, value):
    """Handle one ``NumFollowers`` result delivered by ``join_native``.

    Args:
        tid: task id supplied by ``join_native`` (unused).
        value: result dict of the task.  Results without an ``"nfriends"``
            key are only logged; a ``None`` count sends ``value["orig"]`` to
            ``bad.txt``; otherwise the count is stored via ``add_user``.
    """
    # Single-argument print(...) emits the same text under Python 2 and 3.
    print(datetime.now().isoformat() + " " + str(value))
    if "nfriends" not in value:
        return
    if value["nfriends"] is None:
        bad.write(value["orig"] + "\n")
        bad.flush()
        return
    basename, fname, getname = normalize(value["for"])
    add_user(fname, _parse_count(value["nfriends"]))


def call_back_fd(tid, value):
    """Handle one ``ListFollowers`` result delivered by ``join_native``.

    Writes the friend list to ``facebook/<fname>``, records the list length as
    the owner's count, and queues every friend with no known count in
    ``todo``.  A ``None`` friend list sends ``value["orig"]`` to ``bad.txt``.

    Args:
        tid: task id supplied by ``join_native`` (unused).
        value: result dict of the task.
    """
    print(datetime.now().isoformat() + " " + str(value))
    if value["friends"] is None:
        bad.write(value["orig"] + "\n")
        bad.flush()
        return
    basename, fname, getname = normalize(strip(value["for"]))
    add_user(fname, len(value["friends"]))
    with open("facebook/" + fname, "w") as f:
        for friend in value["friends"]:
            # Separate names so the owner's values above are not overwritten.
            friend_base, friend_fname, _unused = normalize(friend)
            f.write(friend_base + "\n")
            if friend_fname not in users:
                todo.append(friend_base)


def _crawl_chunk(names):
    """Fetch friend lists for *names*, then counts for newly queued friends."""
    global todo

    pending_lists = ResultSet([])
    for name in names:
        pending_lists.add(lf.delay(name))
    pending_lists.join_native(callback=call_back_fd)

    # call_back_fd filled ``todo`` while the join above was running.
    pending_counts = ResultSet([])
    for name in todo:
        pending_counts.add(nf.delay(name))
    todo = []
    pending_counts.join_native(callback=call_back)


# Parsed once and shared by the refresh pass and the crawl loop.
links = _read_links(sys.argv[2])

if sys.argv[4] == "True":
    # Refresh pass: request counts for friends that are listed in an already
    # stored friend file but have no entry in ``users``.
    pending = ResultSet([])
    for link in links:
        basename, finame, getname = normalize(link)
        stored = "facebook/" + finame
        if not op.isfile(stored):
            continue
        with open(stored) as f:
            for line in f:
                basename, fname, getname = normalize(line.strip())
                if fname not in users:
                    print(finame)
                    pending.add(nf.delay(basename))
    pending.join_native(callback=call_back)

chunk_size = int(sys.argv[3])
chunk = []
for link in links:
    basename, fname, getname = normalize(link)
    if op.isfile("facebook/" + fname):
        # Friend list already stored by an earlier run.
        continue
    chunk.append(getname)
    if len(chunk) == chunk_size:
        _crawl_chunk(chunk)
        chunk = []

# The last batch is normally smaller than chunk_size; without this step its
# links would never be crawled, not even on a re-run.
if chunk:
    _crawl_chunk(chunk)

output.close()
bad.close()