"""Fetch the "video_tv_show_favorite" likes of known users through Celery.

Usage: <script> USERS_FILE LINKS_HTML OUTPUT_FILE

* USERS_FILE  -- whitespace-separated ``name count`` lines; loaded into
  ``users``.  A missing file is tolerated.
* LINKS_HTML  -- HTML page; the ``href`` of the first ``<a>`` inside every
  ``<div class="fsl">`` is taken as a link.
* OUTPUT_FILE -- ``name<TAB>likes`` lines.  It is read at start-up so names
  already present are not requested again, then opened for appending.

For every link, the file ``facebook/<name>`` (if it exists) is read line by
line; each line is treated as a profile URL.  URLs whose name is in ``users``
with a positive count and not yet in OUTPUT_FILE are sent to the ``likes``
task in batches.  Results the task could not resolve are logged to
``bad.txt``.
"""
from tasks import NumFollowers, ListFollowers, normalize, Likes
from bs4 import BeautifulSoup
from celery.result import ResultSet
import os.path as op
from datetime import datetime
import sys

nf = NumFollowers()
lf = ListFollowers()
likes = Likes()

# Number of task requests queued before waiting for their results.
_CHUNK_SIZE = 100

# name -> count, loaded from USERS_FILE.
users = {}
try:
    with open(sys.argv[1]) as f:
        for line in f:
            values = line.strip().split()
            if not values:
                # Blank line: nothing to parse.
                continue
            # Drop thousands separators ("1,234" / "1.234") before parsing.
            users[values[0]] = int(
                values[1]
                .replace(",", "")
                .replace(".", "")
                .replace(" ", "")
                .encode("ascii", "ignore")
            )
except IOError:
    # Best effort: a missing/unreadable users file just leaves ``users`` empty.
    pass

# Names that already have a line in OUTPUT_FILE (resume support).
users_likes = {}
try:
    with open(sys.argv[3]) as f:
        for line in f:
            values = line.strip().split()
            if not values:
                continue
            users_likes[values[0]] = True
except IOError:
    # First run: the output file does not exist yet.
    pass

output = open(sys.argv[3], "a")
bad = open("bad.txt", "a")


def add_user(user, degree):
    """Remember *degree* for *user* and append ``user degree`` to the output.

    NOTE(review): not called anywhere in the visible part of this file, and
    it writes a space-separated record to the same file ``add_user2`` writes
    tab-separated records to -- confirm whether it is still needed.
    """
    users[user] = degree
    output.write(user + " " + str(degree) + "\n")


def add_user2(user, likes):
    """Append a ``user<TAB>likes`` record to the output file.

    *likes* is concatenated as-is, so it must already be a string.  (The
    parameter shadows the module-level ``likes`` task inside this function.)
    """
    output.write(user + "\t" + likes + "\n")
    # The output file doubles as the resume checkpoint, so push every record
    # to disk immediately; otherwise an interrupted run loses buffered results.
    output.flush()


def strip2(url):
    """Undo the suffix that ``normalize2`` adds when building a request URL.

    A trailing ``/video_tv_show_favorite`` is removed; otherwise everything
    from the first ``&`` on is dropped (the ``&sk=...`` form).
    """
    suffix = "/video_tv_show_favorite"
    if url.endswith(suffix):
        return url[:-len(suffix)]
    return url.split("&")[0]


def call_back(tid, value):
    """Handle one task result delivered by ``ResultSet.join_native``.

    *tid* is the task id (unused).  *value* is the task's return value; it is
    indexed with the keys ``"likes"``, ``"for"`` and ``"orig"``.  A result
    whose ``"likes"`` is ``None`` is logged to ``bad.txt``; any other result
    is appended to the output file.  Results without a ``"likes"`` key are
    only printed.
    """
    # Single-argument parenthesised form prints the same on Python 2 and 3.
    print(datetime.now().isoformat() + " " + str(value))
    if "likes" not in value:
        return
    if value["likes"] is None:
        bad.write(value["orig"] + "\n")
        bad.flush()
        return
    _, fname, _ = normalize(strip2(value["for"]))
    add_user2(fname, value["likes"])


def normalize2(url):
    """Split a profile URL into ``(basename, fname, getname)``.

    * ``basename`` -- *url* without extra query parameters.
    * ``fname``    -- the user name or id (last path segment, or the value of
      the first query parameter for ``profile.php`` URLs).
    * ``getname``  -- URL of the user's ``video_tv_show_favorite`` page.
    """
    if "profile.php" in url:
        basename = url.split("&")[0]
        fname = basename.split("=")[-1]
        getname = basename + "&sk=video_tv_show_favorite"
    else:
        basename = url.split("?")[0]
        fname = basename.split("/")[-1]
        getname = basename + "/video_tv_show_favorite"
    return basename, fname, getname


def _dispatch(urls):
    """Queue one ``likes`` task per URL and block until all have reported."""
    if not urls:
        return
    todo = ResultSet([])
    for name in urls:
        todo.add(likes.delay(name))
    todo.join_native(callback=call_back)


try:
    # BeautifulSoup reads the handle while constructing the tree, so the file
    # can be closed as soon as the constructor returns.
    with open(sys.argv[2]) as html:
        soup = BeautifulSoup(html)
    links = [div.a["href"] for div in soup.findAll("div", class_="fsl")]

    chunk = []
    for link in links:
        _, finame, _ = normalize(link)
        path = "facebook/" + finame
        if not op.isfile(path):
            continue
        with open(path) as f:
            for line in f:
                _, fname, getname = normalize2(line.strip())
                if users.get(fname, 0) > 0 and fname not in users_likes:
                    chunk.append(getname)
                if len(chunk) == _CHUNK_SIZE:
                    pending, chunk = chunk, []
                    _dispatch(pending)

    # Send whatever is left over; without this the last (< _CHUNK_SIZE) URLs
    # of every run would never be requested.
    _dispatch(chunk)
finally:
    output.close()
    bad.close()