summary refs log tree commit diff stats
path: root/facebook_scraping/client/tasks.py
diff options
context:
space:
mode:
author Thibaut Horel <thibaut.horel@gmail.com> 2014-10-24 12:16:51 -0400
committer Thibaut Horel <thibaut.horel@gmail.com> 2014-10-24 12:16:51 -0400
commit ece1d828d53d6123fcecb5ea8bf9b126d1728ccc (patch)
tree b669382d0e5f1234556d1aeb7fa919891510b24d /facebook_scraping/client/tasks.py
parent 7426d8ff0e7969eb1a86bdb5bec8a0c971309e2b (diff)
download fast-seeding-ece1d828d53d6123fcecb5ea8bf9b126d1728ccc.tar.gz
Add code
Diffstat (limited to 'facebook_scraping/client/tasks.py')
-rw-r--r-- facebook_scraping/client/tasks.py 243
1 files changed, 243 insertions, 0 deletions
diff --git a/facebook_scraping/client/tasks.py b/facebook_scraping/client/tasks.py
new file mode 100644
index 0000000..4557968
--- /dev/null
+++ b/facebook_scraping/client/tasks.py
@@ -0,0 +1,243 @@
+from xvfbwrapper import Xvfb
+from selenium import webdriver
+from selenium.common.exceptions import ElementNotVisibleException,\
+ NoSuchElementException, StaleElementReferenceException, WebDriverException
+from time import sleep
+from bs4 import BeautifulSoup, NavigableString
+from celery import Celery, Task
+from urllib2 import urlopen
+import socket
+
+app = Celery('tasks', broker='amqp://guest@horel.org//')
+app.conf.CELERY_RESULT_BACKEND = 'rpc'
+app.conf.CELERY_ENABLE_UTC = True
+app.conf.CELERY_ACKS_LATE = True
+drivers = [None]
+ip = socket.gethostbyname(socket.gethostname())
+
+
+def strip(url):
+    """Reduce a friends-page url to its base form.
+
+    A url ending in "/friends" loses that suffix; any other url is cut just
+    before its first "&" (and returned unchanged when it contains none).
+    """
+    suffix = "/friends"
+    if not url.endswith(suffix):
+        head, _, _ = url.partition("&")
+        return head
+    return url[:-len(suffix)]
+
+
+def normalize(url):
+    """Split a profile url into ``(basename, fname, getname)``.
+
+    * ``basename`` -- the url without its extra parameters: everything before
+      the first "&" for ``profile.php`` urls, everything before the first "?"
+      (minus trailing slashes) for all other urls.
+    * ``fname`` -- the text after the last "=" of ``basename`` for
+      ``profile.php`` urls, the last path component otherwise.
+    * ``getname`` -- ``basename`` extended with "&sk=friends" or "/friends"
+      respectively.
+    """
+    if "profile.php" in url:
+        basename = url.split("&")[0]
+        fname = basename.split("=")[-1]
+        getname = basename + "&sk=friends"
+    else:
+        # Drop trailing slashes: otherwise "…/john.doe/" would yield an empty
+        # fname and a getname containing "//friends".
+        basename = url.split("?")[0].rstrip("/")
+        fname = basename.split("/")[-1]
+        getname = basename + "/friends"
+    return basename, fname, getname
+
+
+class ListFollowers(Task):
+    """Celery task collecting the profile links shown on a friends page.
+
+    ``run(url)`` always returns a dict with the keys ``"friends"``, ``"for"``
+    (the requested url) and ``"orig"`` (the module-level ``ip``).
+    ``"friends"`` is a list of hrefs, ``[]`` when the page could not be loaded
+    or no list container was found, and ``None`` when a login button is
+    present instead (presumably the session was logged out).
+    """
+
+    @property
+    def driver(self):
+        """Return the Chrome driver cached in ``drivers[0]``.
+
+        On first use a virtual display and a Chrome session are started and
+        the Facebook login form is submitted; later calls reuse the session.
+        """
+        if drivers[0] is None:
+            # NOTE(review): the credentials are fetched over plain HTTP.
+            uname, passwd = urlopen("http://horel.org:8080/").readline().strip().split()
+            # NOTE(review): the display is started here and never stopped.
+            vdisplay = Xvfb()
+            vdisplay.start()
+            driver = webdriver.Chrome()
+            driver.get("https://facebook.com")
+            driver.find_element_by_id("email").send_keys(uname)
+            elem = driver.find_element_by_id("pass")
+            elem.send_keys(passwd)
+            elem.submit()
+            drivers[0] = driver
+        return drivers[0]
+
+    def run(self, url):
+        """Load ``url``, expand the list as far as possible and return its links."""
+        try:
+            self.driver.get(url)
+        except WebDriverException:
+            return {"friends": [], "for": url, "orig": ip}
+
+        # Keep scrolling and clicking the "_359" element (presumably the
+        # "load more" footer) until it disappears or cannot be clicked.
+        while True:
+            for _ in xrange(5):
+                try:
+                    footer = self.driver.find_element_by_class_name("_359")
+                except (NoSuchElementException, ElementNotVisibleException):
+                    sleep(0.1)
+                else:
+                    break
+            else:
+                # Footer not found after 5 attempts: nothing left to expand.
+                break
+
+            try:
+                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+                footer.click()
+            except StaleElementReferenceException:
+                # The footer was replaced meanwhile; look it up again.
+                sleep(0.1)
+            except WebDriverException:
+                for _ in xrange(5):
+                    try:
+                        footer.click()
+                    except (WebDriverException, StaleElementReferenceException):
+                        sleep(0.1)
+                    else:
+                        break
+                else:
+                    # Still not clickable after 5 retries: give up expanding.
+                    break
+
+        for _ in xrange(5):
+            try:
+                div = self.driver.find_element_by_class_name("_30f")
+            except NoSuchElementException:
+                sleep(0.1)
+            else:
+                break
+        else:
+            # No list container: distinguish "login page" from "empty page".
+            try:
+                self.driver.find_element_by_id("loginbutton")
+            except NoSuchElementException:
+                return {"friends": [], "for": url, "orig": ip}
+            else:
+                return {"friends": None, "for": url, "orig": ip}
+
+        soup = BeautifulSoup(div.get_attribute("outerHTML"))
+        # Skip entries without a link (or without an href) instead of raising
+        # on ``None["href"]`` and losing the whole result.
+        anchors = (li.a for li in soup.findAll("li", class_="_698"))
+        return {"friends": [a["href"] for a in anchors
+                            if a is not None and "href" in a.attrs],
+                "for": url,
+                "orig": ip}
+
+
+class NumFollowers(Task):
+    """Celery task reading the friend count displayed on a profile page.
+
+    ``run(url)`` always returns a dict with the keys ``"nfriends"``, ``"for"``
+    (the requested url) and ``"orig"`` (the module-level ``ip``).
+    ``"nfriends"`` is an int when the counter could be parsed, the raw link
+    text when it could not, ``0`` when nothing was found, and ``None`` when a
+    login button is present instead (presumably the session was logged out).
+    """
+
+    @property
+    def driver(self):
+        """Return the Chrome driver cached in ``drivers[0]``.
+
+        On first use a virtual display and a Chrome session are started and
+        the Facebook login form is submitted; later calls reuse the session.
+        """
+        if drivers[0] is None:
+            # NOTE(review): the credentials are fetched over plain HTTP.
+            uname, passwd = urlopen("http://horel.org:8080/").readline().strip().split()
+            # NOTE(review): the display is started here and never stopped.
+            vdisplay = Xvfb()
+            vdisplay.start()
+            driver = webdriver.Chrome()
+            driver.get("https://facebook.com")
+            driver.find_element_by_id("email").send_keys(uname)
+            elem = driver.find_element_by_id("pass")
+            elem.send_keys(passwd)
+            elem.submit()
+            drivers[0] = driver
+        return drivers[0]
+
+    def run(self, url):
+        """Load ``url`` and return the number found in its "_1f8g" box."""
+        try:
+            self.driver.get(url)
+        except WebDriverException:
+            return {"nfriends": 0, "for": url, "orig": ip}
+
+        for _ in xrange(20):
+            try:
+                box = self.driver.find_element_by_class_name("_1f8g")
+            except (NoSuchElementException, ElementNotVisibleException):
+                sleep(0.1)
+            else:
+                break
+        else:
+            # No counter box: distinguish "login page" from "nothing to read".
+            try:
+                self.driver.find_element_by_id("loginbutton")
+            except NoSuchElementException:
+                return {"nfriends": 0, "for": url, "orig": ip}
+            else:
+                return {"nfriends": None, "for": url, "orig": ip}
+
+        soup = BeautifulSoup(box.get_attribute("outerHTML"))
+        a = soup.find("a", class_="uiLinkSubtle")
+        text = a.string if a is not None else None
+        if text is None:
+            # The box has no such link, or the link has no single text child.
+            # Report 0 like the other "nothing found" paths instead of
+            # raising AttributeError.
+            return {"nfriends": 0, "for": url, "orig": ip}
+
+        try:
+            # Remove thousands separators and spaces; non-ASCII characters
+            # are dropped by the encode step.
+            n_friends = int(text.replace(",", "").replace(".", "").replace(" ", "").encode("ascii", "ignore"))
+        except ValueError:
+            # Not a plain number: hand the raw text back to the caller.
+            n_friends = text
+        print(n_friends)
+        return {"nfriends": n_friends,
+                "for": url,
+                "orig": ip}
+
+
+class Likes(Task):
+    """Celery task collecting the item titles listed on a likes-style page.
+
+    ``run(url)`` always returns a dict with the keys ``"likes"``, ``"for"``
+    (the requested url) and ``"orig"`` (the module-level ``ip``).
+    ``"likes"`` is a UTF-8 encoded, tab-separated string of titles, ``[]``
+    when the page could not be loaded, ``""`` when no list container was
+    found, and ``None`` when a login button is present instead (presumably
+    the session was logged out).
+    """
+
+    @property
+    def driver(self):
+        """Return the Chrome driver cached in ``drivers[0]``.
+
+        On first use a virtual display and a Chrome session are started and
+        the Facebook login form is submitted; later calls reuse the session.
+        """
+        if drivers[0] is None:
+            # NOTE(review): the credentials are fetched over plain HTTP.
+            uname, passwd = urlopen("http://horel.org:8080/").readline().strip().split()
+            # NOTE(review): the display is started here and never stopped.
+            vdisplay = Xvfb()
+            vdisplay.start()
+            driver = webdriver.Chrome()
+            driver.get("https://facebook.com")
+            driver.find_element_by_id("email").send_keys(uname)
+            elem = driver.find_element_by_id("pass")
+            elem.send_keys(passwd)
+            elem.submit()
+            drivers[0] = driver
+        return drivers[0]
+
+    @staticmethod
+    def _first_text(a):
+        """Return the first child of tag ``a`` if it is plain text, else "".
+
+        ``a`` may be ``None`` (item without a matching link), which also
+        yields "".  Only the first child is inspected, and the exact type
+        check deliberately excludes NavigableString subclasses.
+        """
+        if a is None:
+            return ""
+        for child in a.children:
+            return child if type(child) == NavigableString else ""
+        return ""
+
+    def run(self, url):
+        """Load ``url``, expand the list as far as possible and return its titles."""
+        try:
+            self.driver.get(url)
+        except WebDriverException:
+            return {"likes": [], "for": url, "orig": ip}
+
+        # Keep scrolling and clicking the "_359" element (presumably the
+        # "load more" footer) until it disappears or cannot be clicked.
+        while True:
+            for _ in xrange(5):
+                try:
+                    footer = self.driver.find_element_by_class_name("_359")
+                except (NoSuchElementException, ElementNotVisibleException):
+                    sleep(0.1)
+                else:
+                    break
+            else:
+                # Footer not found after 5 attempts: nothing left to expand.
+                break
+
+            try:
+                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+                footer.click()
+            except StaleElementReferenceException:
+                # The footer was replaced meanwhile; look it up again.
+                sleep(0.1)
+            except WebDriverException:
+                for _ in xrange(5):
+                    try:
+                        footer.click()
+                    except (WebDriverException, StaleElementReferenceException):
+                        sleep(0.1)
+                    else:
+                        break
+                else:
+                    # Still not clickable after 5 retries: give up expanding.
+                    break
+
+        for _ in xrange(5):
+            try:
+                div = self.driver.find_element_by_class_name("_30f")
+            except NoSuchElementException:
+                sleep(0.1)
+            else:
+                break
+        else:
+            # No list container: distinguish "login page" from "empty page".
+            try:
+                self.driver.find_element_by_id("loginbutton")
+            except NoSuchElementException:
+                return {"likes": "", "for": url, "orig": ip}
+            else:
+                return {"likes": None, "for": url, "orig": ip}
+
+        soup = BeautifulSoup(div.get_attribute("outerHTML"))
+        # Items lacking an "a._gx7" link contribute an empty title instead of
+        # raising AttributeError on ``None.children``.
+        likes = [self._first_text(li.find("a", class_="_gx7"))
+                 for li in soup.findAll("li", class_="_5rz")]
+        return {"likes": u"\t".join(likes).encode("utf8"),
+                "for": url,
+                "orig": ip}
+
+if __name__ == "__main__":
+    # Manual smoke test: scrape one page and dump the result to toto.txt.
+    task = Likes()
+    with open("toto.txt", "w") as f:
+        result = task.run("https://www.facebook.com/grvgaba29" + "/video_tv_show_favorite")
+        # "likes" is already a tab-separated UTF-8 byte string (or ""/[]/None
+        # when nothing was scraped).  Joining it again would put a tab
+        # between every character and may fail on non-ASCII bytes.
+        f.write((result["likes"] or "") + "\n")