# Recovered from a cgit patch whose line breaks had been collapsed:
#   commit:  7426d8ff0e7969eb1a86bdb5bec8a0c971309e2b
#   author:  Thibaut Horel
#   date:    Sun, 2 Feb 2014 16:53:22 -0500
#   subject: Facebook scraping
#   diff:    tasks.py (new file, index 0000000..cb0c3aa)
#   footer:  cgit v1.2.3-70-g09d2
"""Celery tasks that read friend data from Facebook pages.

The tasks drive a Chrome browser through Selenium on an Xvfb display.
One logged-in browser session is created lazily and shared through the
module-level ``drivers`` list.

This module targets Python 2 (``urllib2``, ``xrange``).
"""

from contextlib import closing
from time import sleep
from urllib2 import urlopen

from bs4 import BeautifulSoup
from celery import Celery, Task
from selenium import webdriver
from selenium.common.exceptions import (
    ElementNotVisibleException,
    NoSuchElementException,
    StaleElementReferenceException,
)
from xvfbwrapper import Xvfb

app = Celery('tasks', broker='amqp://guest@horel.org//')
app.conf.CELERY_RESULT_BACKEND = 'rpc'
app.conf.CELERY_ENABLE_UTC = True

# Single-slot cache holding the logged-in WebDriver once it has been created.
drivers = [None]

# NOTE(review): the credentials are fetched over plain HTTP, so they travel
# unencrypted -- confirm this endpoint is only reachable on a trusted network.
CREDENTIALS_URL = "http://horel.org:8080/"
LOGIN_URL = "https://facebook.com"

# Polling parameters shared by every "wait for an element" loop below.
FIND_ATTEMPTS = 5
FIND_DELAY = 0.1


def normalize(url):
    """Split a profile URL into its canonical parts.

    Two URL shapes are recognised:

    * ``.../profile.php?id=N&...``: everything from the first ``&`` on is
      dropped, the name is the text after the last ``=``.
    * anything else: the query string is dropped, a trailing ``/`` is
      removed, and the name is the last path segment.

    Returns a ``(basename, fname, getname)`` tuple where ``getname`` is the
    URL of the friends page derived from ``basename``.

    >>> normalize("https://www.facebook.com/profile.php?id=42&ref=ts")[1:]
    ('42', 'https://www.facebook.com/profile.php?id=42&sk=friends')
    >>> normalize("https://www.facebook.com/john.doe/?fref=ts")[1:]
    ('john.doe', 'https://www.facebook.com/john.doe/friends')
    """
    if "profile.php" in url:
        basename = url.split("&")[0]
        fname = basename.split("=")[-1]
        getname = basename + "&sk=friends"
    else:
        # Strip the trailing slash so that ".../john.doe/" does not yield an
        # empty name and a ".../john.doe//friends" URL.
        basename = url.split("?")[0].rstrip("/")
        fname = basename.split("/")[-1]
        getname = basename + "/friends"
    return basename, fname, getname


def _login_driver():
    """Start Xvfb and Chrome, log in, and return the WebDriver.

    The user name and password are read as two whitespace-separated fields
    from the first line served by ``CREDENTIALS_URL``.

    Raises ``ValueError`` when that line does not hold exactly two fields.
    If Chrome start-up or the login steps raise, the browser and the virtual
    display are shut down before the exception propagates, so a failed
    attempt does not leave stray processes behind.
    """
    # Close the HTTP response deterministically instead of leaking it.
    with closing(urlopen(CREDENTIALS_URL)) as response:
        uname, passwd = response.readline().strip().split()

    vdisplay = Xvfb()
    vdisplay.start()
    driver = None
    try:
        driver = webdriver.Chrome()
        driver.get(LOGIN_URL)
        driver.find_element_by_id("email").send_keys(uname)
        elem = driver.find_element_by_id("pass")
        elem.send_keys(passwd)
        elem.submit()
    except Exception:
        # Tear down whatever was started, then let the caller see the error.
        if driver is not None:
            driver.quit()
        vdisplay.stop()
        raise
    return driver


def _get_driver():
    """Return the shared WebDriver, creating and logging it in on first use."""
    if drivers[0] is None:
        drivers[0] = _login_driver()
    return drivers[0]


def _find_by_class(driver, class_name, ignored=(NoSuchElementException,)):
    """Poll for an element by CSS class name.

    Makes up to ``FIND_ATTEMPTS`` lookups, sleeping ``FIND_DELAY`` seconds
    after each one that raises an exception listed in ``ignored``.

    Returns the element, or ``None`` if every attempt failed.
    """
    for _ in xrange(FIND_ATTEMPTS):
        try:
            return driver.find_element_by_class_name(class_name)
        except ignored:
            sleep(FIND_DELAY)
    return None


class ListFollowers(Task):
    """Task returning the friend links found on a page."""

    @property
    def driver(self):
        """The shared, logged-in WebDriver (created on first access)."""
        return _get_driver()

    def run(self, url):
        """Load ``url`` and collect the friend links it displays.

        Returns ``{"friends": [href, ...], "for": url}``; the list is empty
        when the friends container never shows up.
        """
        driver = self.driver
        driver.get(url)

        # Keep clicking the "_359" element until it can no longer be found.
        # (Presumably this is the pager that loads more entries -- verify
        # against the live page markup.)
        while True:
            footer = _find_by_class(
                driver, "_359",
                ignored=(NoSuchElementException, ElementNotVisibleException))
            if footer is None:
                break
            try:
                footer.click()
            except StaleElementReferenceException:
                # The element was replaced between lookup and click; give the
                # page a moment and look it up again.
                sleep(FIND_DELAY)

        div = _find_by_class(driver, "_30f")
        if div is None:
            return {"friends": [], "for": url}

        soup = BeautifulSoup(div.get_attribute("outerHTML"))
        friends = [li.a["href"]
                   for li in soup.findAll("li", class_="_698")
                   # Skip items without a link instead of crashing on them.
                   if li.a is not None and li.a.get("href")]
        return {"friends": friends, "for": url}


class NumFollowers(Task):
    """Task returning the friend count displayed on a page."""

    @property
    def driver(self):
        """The shared, logged-in WebDriver (created on first access)."""
        return _get_driver()

    def run(self, url):
        """Load ``url`` and read the friend count it displays.

        Returns ``{"nfriends": count, "for": url}``.  ``count`` is 0 when the
        "_1f8g" box, or the ``uiLinkSubtle`` link inside it, is missing.

        Raises ``ValueError`` if the link text is not an integer once the
        thousands separators (",") are removed.
        """
        driver = self.driver
        driver.get(url)

        box = _find_by_class(
            driver, "_1f8g",
            ignored=(NoSuchElementException, ElementNotVisibleException))
        if box is None:
            return {"nfriends": 0, "for": url}

        soup = BeautifulSoup(box.get_attribute("outerHTML"))
        link = soup.find("a", class_="uiLinkSubtle")
        # A missing link (or one without plain text) is treated like a
        # missing box rather than crashing with AttributeError.
        if link is None or link.string is None:
            return {"nfriends": 0, "for": url}

        return {"nfriends": int(link.string.replace(",", "")),
                "for": url}