diff options
Diffstat (limited to 'run.py')
| -rw-r--r-- | run.py | 73 |
1 files changed, 73 insertions, 0 deletions
"""Collect friend lists and friend counts through the Celery tasks in ``tasks``.

The script keeps an append-only cache, ``all_users.txt``, with one
``<user> <degree>`` pair per line, and one file per crawled user under
``facebook/`` listing that user's friends (one entry per line).

It runs in two phases:

1. For every entry already stored under ``facebook/*`` whose user is not in
   the cache, queue a ``NumFollowers`` task and record the results.
2. For the first 100 links found in ``seed.txt``, fetch the friend list with
   ``ListFollowers`` (unless it is already on disk), store it, and queue
   ``NumFollowers`` for each friend that is not in the cache.

NOTE(review): ``normalize`` comes from ``tasks`` and is used here as returning
a ``(basename, fname, getname)`` triple; the exact meaning of each element is
not visible from this file.
"""
from tasks import NumFollowers, ListFollowers, normalize
from bs4 import BeautifulSoup
from celery.result import ResultSet
import os.path as op
from glob import glob

nf = NumFollowers()
lf = ListFollowers()
rset = ResultSet([])  # not used below; kept so the module's names are unchanged

# Cache of already-resolved users: name -> degree, loaded from all_users.txt.
users = {}
try:
    with open("all_users.txt") as f:
        for line in f:
            values = line.strip().split()
            if len(values) < 2:
                # Blank or truncated line (e.g. after an interrupted run):
                # skip it instead of dying with IndexError.
                continue
            users[values[0]] = int(values[1])
except IOError:
    # No cache file yet (first run): start with an empty cache.
    pass

# Append handle used by add_user(); closed in the ``finally`` at the bottom.
output = open("all_users.txt", "a")


def strip(url):
    """Return ``url`` without its trailing ``/friends`` or query tail.

    If ``url`` ends with ``"/friends"`` that suffix is removed; otherwise
    everything from the first ``"&"`` onwards is dropped.
    """
    if url.endswith("/friends"):
        return url[:-len("/friends")]
    return url.split("&")[0]


def add_user(user, degree):
    """Record ``degree`` for ``user`` in memory and append it to the cache file.

    The line is flushed immediately so that progress is not lost if the
    script is interrupted.
    """
    # Single-argument print() gives the same output on Python 2 and 3.
    print("%s %s" % (user, degree))
    users[user] = degree
    output.write(user + " " + str(degree) + "\n")
    output.flush()


def call_back(tid, value):
    """Handle one task result delivered by ``ResultSet.join_native``.

    ``tid`` is the task id supplied by Celery and is not used. ``value`` is
    the task's result mapping: results carrying a ``"friends"`` key are
    ignored, results carrying ``"nfriends"`` are stored via ``add_user``
    under the normalized form of ``value["for"]``.
    """
    if "friends" in value:
        return

    if "nfriends" in value:
        _, fname, _ = normalize(value["for"])
        add_user(fname, value["nfriends"])


try:
    # Phase 1: resolve the degree of every stored friend not yet in the cache.
    todo = ResultSet([])
    for finame in glob("facebook/*"):
        with open(finame) as f:
            for line in f:
                basename, fname, getname = normalize(line.strip())
                if fname not in users:
                    print(finame)
                    todo.add(nf.delay(basename))
    todo.join_native(callback=call_back)

    # Phase 2: crawl the seed page.
    # NOTE(review): no parser is passed to BeautifulSoup, so bs4 picks the
    # best one installed; pin one explicitly if reproducibility matters.
    with open("seed.txt") as seed_file:
        soup = BeautifulSoup(seed_file)
    links = [div.a["href"] for div in soup.findAll("div", class_="fsl")]
    for link in links[:100]:
        basename, fname, getname = normalize(link)
        if not op.isfile("facebook/" + fname):
            result = lf.delay(getname)
            value = result.get()
            basename, fname, getname = normalize(strip(value["for"]))
            add_user(fname, len(value["friends"]))
            todo = ResultSet([])
            friends_path = "facebook/" + fname
            with open(friends_path, "w") as f:
                for friend in value["friends"]:
                    # Separate names here: reusing ``fname`` would clobber the
                    # owner's name and make the log line below report the
                    # last friend instead of the file just written.
                    friend_base, friend_name, _ = normalize(friend)
                    f.write(friend_base + "\n")
                    if friend_name not in users:
                        todo.add(nf.delay(friend_base))
            print(friends_path)
            todo.join_native(callback=call_back)
finally:
    # Always release the cache file handle, even if a task or parse fails.
    output.close()
