summary | refs | log | tree | commit | diff | stats
path: root/facebook_scraping/run2.py
diff options
context:
space:
mode:
author: Thibaut Horel <thibaut.horel@gmail.com> 2014-10-24 12:16:51 -0400
committer: Thibaut Horel <thibaut.horel@gmail.com> 2014-10-24 12:16:51 -0400
commit: ece1d828d53d6123fcecb5ea8bf9b126d1728ccc (patch)
tree: b669382d0e5f1234556d1aeb7fa919891510b24d /facebook_scraping/run2.py
parent: 7426d8ff0e7969eb1a86bdb5bec8a0c971309e2b (diff)
download: fast-seeding-ece1d828d53d6123fcecb5ea8bf9b126d1728ccc.tar.gz
Add code
Diffstat (limited to 'facebook_scraping/run2.py')
-rw-r--r-- facebook_scraping/run2.py 90
1 files changed, 90 insertions, 0 deletions
diff --git a/facebook_scraping/run2.py b/facebook_scraping/run2.py
new file mode 100644
index 0000000..a52a37b
--- /dev/null
+++ b/facebook_scraping/run2.py
@@ -0,0 +1,90 @@
+from tasks import NumFollowers, ListFollowers, normalize, Likes
+from bs4 import BeautifulSoup
+from celery.result import ResultSet
+import os.path as op
+from datetime import datetime
+import sys
+
+nf = NumFollowers()
+lf = ListFollowers()
+likes = Likes()
+
+users = {}
+try:
+ with open(sys.argv[1]) as f:
+ for line in f:
+ values = line.strip().split()
+ users[values[0]] = int(values[1].replace(",", "").replace(".", "").replace(" ", "").encode("ascii", "ignore"))
+except IOError:
+ pass
+
+users_likes = {}
+try:
+ with open(sys.argv[3]) as f:
+ for line in f:
+ values = line.strip().split()
+ users_likes[values[0]] = True
+except IOError:
+ pass
+
+output = open(sys.argv[3], "a")
+bad = open("bad.txt", "a")
+
+
+def add_user(user, degree):
+ users[user] = degree
+ output.write(user + " " + str(degree) + "\n")
+
+
+def add_user2(user, likes):
+ output.write(user + "\t" + likes + "\n")
+
+
+def strip2(url):
+ l = "/video_tv_show_favorite"
+ if url.endswith(l):
+ return url[:-len(l)]
+ else:
+ return url.split("&")[0]
+
+
+def call_back(tid, value):
+ print datetime.now().isoformat() + " " + str(value)
+ if "likes" in value:
+ if value["likes"] is None:
+ bad.write(value["orig"] + "\n")
+ bad.flush()
+ return
+ basename, fname, getname = normalize(strip2(value["for"]))
+ add_user2(fname, value["likes"])
+ return
+
+
+def normalize2(url):
+ if "profile.php" in url:
+ basename = url.split("&")[0]
+ fname = basename.split("=")[-1]
+ getname = basename + "&sk=video_tv_show_favorite"
+ else:
+ basename = url.split("?")[0]
+ fname = basename.split("/")[-1]
+ getname = basename + "/video_tv_show_favorite"
+ return basename, fname, getname
+
+soup = BeautifulSoup(open(sys.argv[2]))
+links = [div.a["href"] for div in soup.findAll("div", class_="fsl")]
+chunk = []
+for link in links:
+ basename, finame, getname = normalize(link)
+ if op.isfile("facebook/" + finame):
+ with open("facebook/" + finame) as f:
+ for line in f:
+ basename, fname, getname = normalize2(line.strip())
+ if fname in users and users[fname] > 0 and fname not in users_likes:
+ chunk.append(getname)
+ if len(chunk) == 100:
+ todo = ResultSet([])
+ for name in chunk:
+ todo.add(likes.delay(name))
+ chunk = []
+ todo.join_native(callback=call_back)