diff options
Diffstat (limited to 'facebook_scraping/run.py')
| -rw-r--r-- | facebook_scraping/run.py | 91 |
1 files changed, 91 insertions, 0 deletions
"""Crawl Facebook friend lists and friend counts through Celery tasks.

Usage: run.py USERS_FILE LINKS_HTML CHUNK_SIZE [BACKFILL]

USERS_FILE  "<name> <count>" lines; loaded at start-up when it exists and
            appended to as new counts arrive.
LINKS_HTML  HTML page whose ``div.fsl`` anchors are the profiles to crawl.
CHUNK_SIZE  number of profiles dispatched per batch.
BACKFILL    when the literal string "True", first request counts for
            friends listed in existing ``facebook/`` files that are not yet
            present in USERS_FILE.

NOTE(review): this file was reconstructed from a flattened diff, so the
nesting of the ``join_native`` calls is inferred -- confirm against the
repository copy.
"""
from tasks import NumFollowers, ListFollowers, normalize, strip
from bs4 import BeautifulSoup
from celery.result import ResultSet
import os.path as op
from datetime import datetime
import sys

nf = NumFollowers()
lf = ListFollowers()


def _parse_count(raw):
    """Turn a formatted count such as "1,234" or "1.234" into an int.

    Raises ValueError when nothing numeric is left after removing the
    separators.
    """
    cleaned = str(raw)
    for separator in (",", ".", " "):
        cleaned = cleaned.replace(separator, "")
    # Carried over unchanged; meant to discard any non-ASCII leftovers.
    return int(cleaned.encode("ascii", "ignore"))


# Known users and their friend counts, keyed by normalized file name.
users = {}
try:
    with open(sys.argv[1]) as f:
        for line in f:
            values = line.strip().split()
            if len(values) < 2:
                # Blank or truncated line (e.g. after an interrupted run).
                continue
            users[values[0]] = _parse_count(values[1])
except IOError:
    # First run: the users file does not exist yet.
    pass

output = open(sys.argv[1], "a")
bad = open("bad.txt", "a")


def add_user(user, degree):
    """Record *user* with friend count *degree* in memory and on disk."""
    users[user] = degree
    output.write(user + " " + str(degree) + "\n")
    # Flush like ``bad`` does so progress survives a crash mid-crawl.
    output.flush()


def call_back(tid, value):
    """Handle one NumFollowers result.

    *tid* is the Celery task id (unused); *value* is the task's result
    dict.  Results without an "nfriends" key are ignored; a ``None`` count
    is logged to ``bad.txt``; anything else is stored via ``add_user``.
    """
    print(datetime.now().isoformat() + " " + str(value))
    if "nfriends" not in value:
        return
    if value["nfriends"] is None:
        bad.write(value["orig"] + "\n")
        bad.flush()
        return
    basename, fname, getname = normalize(value["for"])
    add_user(fname, _parse_count(value["nfriends"]))


# Friends discovered by call_back_fd whose friend count is still unknown.
todo = []


def call_back_fd(tid, value):
    """Handle one ListFollowers result.

    Stores the owner's friend count, writes the friend list to
    ``facebook/<fname>`` and queues every friend not yet in ``users`` on
    ``todo``.  A ``None`` friend list is logged to ``bad.txt``.
    """
    print(datetime.now().isoformat() + " " + str(value))
    if value["friends"] is None:
        bad.write(value["orig"] + "\n")
        bad.flush()
        return
    basename, fname, getname = normalize(strip(value["for"]))
    add_user(fname, len(value["friends"]))
    with open("facebook/" + fname, "w") as f:
        for friend in value["friends"]:
            friend_base, friend_fname, friend_get = normalize(friend)
            f.write(friend_base + "\n")
            if friend_fname not in users:
                todo.append(friend_base)


def _load_links(path):
    """Return the href of the first anchor in every ``div.fsl`` of *path*."""
    with open(path) as page:
        soup = BeautifulSoup(page)
    return [div.a["href"] for div in soup.findAll("div", class_="fsl")]


def _process_chunk(names):
    """Fetch friend lists for *names*, then counts for newly seen friends."""
    friend_lists = ResultSet([])
    for name in names:
        friend_lists.add(lf.delay(name))
    friend_lists.join_native(callback=call_back_fd)

    counts = ResultSet([])
    for name in todo:
        counts.add(nf.delay(name))
    # Empty the shared queue in place before joining, as the original did.
    del todo[:]
    counts.join_native(callback=call_back)


try:
    links = _load_links(sys.argv[2])
    chunk_size = int(sys.argv[3])
    # A missing fourth argument simply means "no backfill".
    backfill = len(sys.argv) > 4 and sys.argv[4] == "True"

    if backfill:
        missing = ResultSet([])
        for link in links:
            basename, finame, getname = normalize(link)
            path = "facebook/" + finame
            if not op.isfile(path):
                continue
            with open(path) as f:
                for line in f:
                    basename, fname, getname = normalize(line.strip())
                    if fname not in users:
                        print(finame)
                        missing.add(nf.delay(basename))
        missing.join_native(callback=call_back)

    chunk = []
    for link in links:
        basename, fname, getname = normalize(link)
        if not op.isfile("facebook/" + fname):
            chunk.append(getname)
        if len(chunk) == chunk_size:
            _process_chunk(chunk)
            chunk = []
    if chunk:
        # Dispatch the trailing partial chunk, which used to be dropped.
        _process_chunk(chunk)
finally:
    output.close()
    bad.close()
