summary refs log tree commit diff stats
path: root/facebook_scraping/run.py
diff options
context:
space:
mode:
Diffstat (limited to 'facebook_scraping/run.py')
-rw-r--r-- facebook_scraping/run.py 91
1 files changed, 91 insertions, 0 deletions
diff --git a/facebook_scraping/run.py b/facebook_scraping/run.py
new file mode 100644
index 0000000..94eb1a4
--- /dev/null
+++ b/facebook_scraping/run.py
@@ -0,0 +1,91 @@
+from tasks import NumFollowers, ListFollowers, normalize, strip
+from bs4 import BeautifulSoup
+from celery.result import ResultSet
+import os.path as op
+from datetime import datetime
+import sys
+
+nf = NumFollowers()
+lf = ListFollowers()
+
+users = {}
+try:
+ with open(sys.argv[1]) as f:
+ for line in f:
+ values = line.strip().split()
+ users[values[0]] = int(values[1].replace(",", "").replace(".", "").replace(" ", "").encode("ascii", "ignore"))
+except IOError:
+ pass
+
+output = open(sys.argv[1], "a")
+bad = open("bad.txt", "a")
+
+
+def add_user(user, degree):
+ users[user] = degree
+ output.write(user + " " + str(degree) + "\n")
+
+
+def call_back(tid, value):
+ print datetime.now().isoformat() + " " + str(value)
+ if "nfriends" in value:
+ if value["nfriends"] is None:
+ bad.write(value["orig"] + "\n")
+ bad.flush()
+ return
+ basename, fname, getname = normalize(value["for"])
+ n_friends = int(str(value["nfriends"]).replace(",", "").replace(".", "").replace(" ", "").encode("ascii", "ignore"))
+ add_user(fname, n_friends)
+ return
+
+# Optional catch-up pass, run only when the 4th CLI argument is the literal
+# string "True". For the profiles linked from the HTML page in argv[2] whose
+# friend list is already saved under facebook/, NumFollowers tasks are queued
+# for listed friends that have no entry in `users`, and the results are handed
+# to call_back.
+# NOTE(review): this view has lost the file's indentation, so the exact nesting
+# of the statements below is an assumption -- confirm against the original.
+if sys.argv[4] == "True":
+ todo = ResultSet([])
+ soup = BeautifulSoup(open(sys.argv[2]))
+ # href of the anchor inside every <div class="fsl"> on the page
+ links = [div.a["href"] for div in soup.findAll("div", class_="fsl")]
+ chunk = []  # not read anywhere in this pass
+ for link in links:
+ basename, finame, getname = normalize(link)
+ if op.isfile("facebook/" + finame):
+ with open("facebook/" + finame) as f:
+ for line in f:
+ basename, fname, getname = normalize(line.strip())
+ if fname not in users:
+ print finame
+ todo.add(nf.delay(basename))
+ # blocks until the queued tasks finish; call_back receives each result
+ todo.join_native(callback=call_back)
+# From here on `todo` is a plain list: call_back_fd appends names to it and the
+# main loop below drains it.
+todo = []
+
+
+def call_back_fd(tid, value):
+ print datetime.now().isoformat() + " " + str(value)
+ if value["friends"] is None:
+ bad.write(value["orig"] + "\n")
+ bad.flush()
+ return
+ basename, fname, getname = normalize(strip(value["for"]))
+ add_user(fname, len(value["friends"]))
+ with open("facebook/" + fname, "w") as f:
+ for friend in value["friends"]:
+ basename, fname, getname = normalize(friend)
+ f.write(basename + "\n")
+ if fname not in users:
+ todo.append(basename)
+
+# Main pass: parse the page in argv[2] again and collect the profiles that have
+# no saved friend list under facebook/ yet. Names are batched in `chunk`; when
+# a batch holds int(argv[3]) names, one ListFollowers task is queued per name
+# and the results go to call_back_fd. The names call_back_fd put into `todo`
+# are then sent to NumFollowers, with results handled by call_back.
+# NOTE(review): indentation was lost in this view, so the nesting below is an
+# assumption. As it reads, a final batch smaller than int(argv[3]) never looks
+# to be dispatched -- confirm against the original file.
+soup = BeautifulSoup(open(sys.argv[2]))
+# href of the anchor inside every <div class="fsl"> on the page
+links = [div.a["href"] for div in soup.findAll("div", class_="fsl")]
+chunk = []
+for link in links:
+ basename, fname, getname = normalize(link)
+ if not op.isfile("facebook/" + fname):
+ chunk.append(getname)
+ if len(chunk) == int(sys.argv[3]):
+ todofd = ResultSet([])
+ for name in chunk:
+ todofd.add(lf.delay(name))
+ chunk = []
+ # blocks until the batch is done; call_back_fd fills `todo` meanwhile
+ todofd.join_native(callback=call_back_fd)
+ todos = ResultSet([])
+ for name in todo:
+ todos.add(nf.delay(name))
+ todo = []
+ todos.join_native(callback=call_back)