| author | Thibaut Horel <thibaut.horel@gmail.com> | 2014-10-24 12:16:51 -0400 |
|---|---|---|
| committer | Thibaut Horel <thibaut.horel@gmail.com> | 2014-10-24 12:16:51 -0400 |
| commit | ece1d828d53d6123fcecb5ea8bf9b126d1728ccc (patch) | |
| tree | b669382d0e5f1234556d1aeb7fa919891510b24d /facebook_scraping/run2.py | |
| parent | 7426d8ff0e7969eb1a86bdb5bec8a0c971309e2b (diff) | |
| download | fast-seeding-ece1d828d53d6123fcecb5ea8bf9b126d1728ccc.tar.gz | |
Add code
Diffstat (limited to 'facebook_scraping/run2.py')
| mode | file | lines |
|---|---|---|
| -rw-r--r-- | facebook_scraping/run2.py | 90 |
1 file changed, 90 insertions, 0 deletions
```diff
diff --git a/facebook_scraping/run2.py b/facebook_scraping/run2.py
new file mode 100644
index 0000000..a52a37b
--- /dev/null
+++ b/facebook_scraping/run2.py
@@ -0,0 +1,90 @@
+from tasks import NumFollowers, ListFollowers, normalize, Likes
+from bs4 import BeautifulSoup
+from celery.result import ResultSet
+import os.path as op
+from datetime import datetime
+import sys
+
+nf = NumFollowers()
+lf = ListFollowers()
+likes = Likes()
+
+users = {}
+try:
+    with open(sys.argv[1]) as f:
+        for line in f:
+            values = line.strip().split()
+            users[values[0]] = int(values[1].replace(",", "").replace(".", "").replace(" ", "").encode("ascii", "ignore"))
+except IOError:
+    pass
+
+users_likes = {}
+try:
+    with open(sys.argv[3]) as f:
+        for line in f:
+            values = line.strip().split()
+            users_likes[values[0]] = True
+except IOError:
+    pass
+
+output = open(sys.argv[3], "a")
+bad = open("bad.txt", "a")
+
+
+def add_user(user, degree):
+    users[user] = degree
+    output.write(user + " " + str(degree) + "\n")
+
+
+def add_user2(user, likes):
+    output.write(user + "\t" + likes + "\n")
+
+
+def strip2(url):
+    l = "/video_tv_show_favorite"
+    if url.endswith(l):
+        return url[:-len(l)]
+    else:
+        return url.split("&")[0]
+
+
+def call_back(tid, value):
+    print datetime.now().isoformat() + " " + str(value)
+    if "likes" in value:
+        if value["likes"] is None:
+            bad.write(value["orig"] + "\n")
+            bad.flush()
+            return
+        basename, fname, getname = normalize(strip2(value["for"]))
+        add_user2(fname, value["likes"])
+        return
+
+
+def normalize2(url):
+    if "profile.php" in url:
+        basename = url.split("&")[0]
+        fname = basename.split("=")[-1]
+        getname = basename + "&sk=video_tv_show_favorite"
+    else:
+        basename = url.split("?")[0]
+        fname = basename.split("/")[-1]
+        getname = basename + "/video_tv_show_favorite"
+    return basename, fname, getname
+
+soup = BeautifulSoup(open(sys.argv[2]))
+links = [div.a["href"] for div in soup.findAll("div", class_="fsl")]
+chunk = []
+for link in links:
+    basename, finame, getname = normalize(link)
+    if op.isfile("facebook/" + finame):
+        with open("facebook/" + finame) as f:
+            for line in f:
+                basename, fname, getname = normalize2(line.strip())
+                if fname in users and users[fname] > 0 and fname not in users_likes:
+                    chunk.append(getname)
+                    if len(chunk) == 100:
+                        todo = ResultSet([])
+                        for name in chunk:
+                            todo.add(likes.delay(name))
+                        chunk = []
+                        todo.join_native(callback=call_back)
```
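The main loop in this commit batches profile URLs into chunks of 100 and dispatches each chunk as Celery tasks, collecting results through a `ResultSet` whose `join_native` callback handles each result as it completes. Below is a minimal, self-contained sketch of that batching pattern; the broker URL and the `fetch_likes` task are hypothetical stand-ins for the `Likes` task imported from `tasks`, which is not part of this diff.

```python
from celery import Celery
from celery.result import ResultSet

# Placeholder app/task: the real project imports Likes from tasks.py,
# whose implementation is not shown in this commit.
app = Celery("sketch",
             broker="redis://localhost:6379/0",
             backend="redis://localhost:6379/0")


@app.task
def fetch_likes(url):
    # Stand-in for the real scraping task; it would fetch `url` and
    # return a dict along the lines of {"for": url, "likes": ..., "orig": url}.
    return {"for": url, "likes": "0", "orig": url}


def handle_result(task_id, value):
    # Mirrors call_back() above: invoked once per finished task.
    print(task_id, value)


def process_in_chunks(urls, chunk_size=100):
    chunk = []
    for url in urls:
        chunk.append(url)
        if len(chunk) == chunk_size:
            todo = ResultSet([])
            for name in chunk:
                todo.add(fetch_likes.delay(name))
            chunk = []
            # join_native blocks until every result in the set arrives,
            # calling handle_result(task_id, value) for each one.
            todo.join_native(callback=handle_result)
```

As in run2.py itself, any final partial chunk smaller than the chunk size is left undispatched.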
