From ece1d828d53d6123fcecb5ea8bf9b126d1728ccc Mon Sep 17 00:00:00 2001
From: Thibaut Horel
Date: Fri, 24 Oct 2014 12:16:51 -0400
Subject: Add code

---
 facebook_scraping/client/Makefile         |  15 ++
 facebook_scraping/client/__init__.py      |   0
 facebook_scraping/client/requirements.txt |   4 +
 facebook_scraping/client/tasks.py         | 243 ++++++++++++++++++++++++++++++
 4 files changed, 262 insertions(+)
 create mode 100644 facebook_scraping/client/Makefile
 create mode 100644 facebook_scraping/client/__init__.py
 create mode 100644 facebook_scraping/client/requirements.txt
 create mode 100644 facebook_scraping/client/tasks.py
(limited to 'facebook_scraping/client')

diff --git a/facebook_scraping/client/Makefile b/facebook_scraping/client/Makefile
new file mode 100644
index 0000000..3a07802
--- /dev/null
+++ b/facebook_scraping/client/Makefile
@@ -0,0 +1,15 @@
+all: boostrap run
+
+boostrap:
+	curl http://thibaut.horel.org/facebook/facebook.tar.gz > facebook.tar.gz
+	tar -xzf facebook.tar.gz
+
+run:
+	celery -A tasks --concurrency=2 worker --detach -l info
+
+stop:
+	rm -f celeryd.pid
+	pgrep -f "celery worker" | xargs kill -9
+
+restart:
+	pgrep -f "celery worker" | xargs kill -HUP
diff --git a/facebook_scraping/client/__init__.py b/facebook_scraping/client/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/facebook_scraping/client/requirements.txt b/facebook_scraping/client/requirements.txt
new file mode 100644
index 0000000..cba9c1f
--- /dev/null
+++ b/facebook_scraping/client/requirements.txt
@@ -0,0 +1,4 @@
+beautifulsoup4
+celery
+selenium
+xvfbwrapper
diff --git a/facebook_scraping/client/tasks.py b/facebook_scraping/client/tasks.py
new file mode 100644
index 0000000..4557968
--- /dev/null
+++ b/facebook_scraping/client/tasks.py
@@ -0,0 +1,243 @@
+from xvfbwrapper import Xvfb
+from selenium import webdriver
+from selenium.common.exceptions import ElementNotVisibleException,\
+    NoSuchElementException, StaleElementReferenceException, WebDriverException
+from time import sleep
+from bs4 import BeautifulSoup, NavigableString
+from celery import Celery, Task
+from urllib2 import urlopen
+import socket
+
+app = Celery('tasks', broker='amqp://guest@horel.org//')
+app.conf.CELERY_RESULT_BACKEND = 'rpc'
+app.conf.CELERY_ENABLE_UTC = True
+app.conf.CELERY_ACKS_LATE = True
+drivers = [None]
+ip = socket.gethostbyname(socket.gethostname())
+
+
+def strip(url):
+    if url.endswith("/friends"):
+        return url[:-8]
+    else:
+        return url.split("&")[0]
+
+
+def normalize(url):
+    if "profile.php" in url:
+        basename = url.split("&")[0]
+        fname = basename.split("=")[-1]
+        getname = basename + "&sk=friends"
+    else:
+        basename = url.split("?")[0]
+        fname = basename.split("/")[-1]
+        getname = basename + "/friends"
+    return basename, fname, getname
+
+
+class ListFollowers(Task):
+
+    @property
+    def driver(self):
+        if drivers[0] is None:
+            uname, passwd = urlopen("http://horel.org:8080/").readline().strip().split()
+            vdisplay = Xvfb()
+            vdisplay.start()
+            driver = webdriver.Chrome()
+            driver.get("https://facebook.com")
+            driver.find_element_by_id("email").send_keys(uname)
+            elem = driver.find_element_by_id("pass")
+            elem.send_keys(passwd)
+            elem.submit()
+            drivers[0] = driver
+        return drivers[0]
+
+    def run(self, url):
+        try:
+            self.driver.get(url)
+        except WebDriverException:
+            return {"friends": [], "for": url, "orig": ip}
+
+        while True:
+            for _ in xrange(5):
+                try:
+                    footer = self.driver.find_element_by_class_name("_359")
+                except (NoSuchElementException, ElementNotVisibleException):
+                    sleep(0.1)
+                else:
+                    break
+            else:
+                break
+
+            try:
+                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+                footer.click()
+            except StaleElementReferenceException:
+                sleep(0.1)
+            except WebDriverException:
+                for _ in xrange(5):
+                    try:
+                        footer.click()
+                    except (WebDriverException, StaleElementReferenceException):
+                        sleep(0.1)
+                    else:
+                        break
+                else:
+                    break
+
+        for _ in xrange(5):
+            try:
+                div = self.driver.find_element_by_class_name("_30f")
+            except NoSuchElementException:
+                sleep(0.1)
+            else:
+                break
+        else:
+            try:
+                self.driver.find_element_by_id("loginbutton")
+            except NoSuchElementException:
+                return {"friends": [], "for": url, "orig": ip}
+            else:
+                return {"friends": None, "for": url, "orig": ip}
+
+        soup = BeautifulSoup(div.get_attribute("outerHTML"))
+        return {"friends": [li.a["href"]
+                            for li in soup.findAll("li", class_="_698")],
+                "for": url,
+                "orig": ip}
+
+
+class NumFollowers(Task):
+
+    @property
+    def driver(self):
+        if drivers[0] is None:
+            uname, passwd = urlopen("http://horel.org:8080/").readline().strip().split()
+            vdisplay = Xvfb()
+            vdisplay.start()
+            driver = webdriver.Chrome()
+            driver.get("https://facebook.com")
+            driver.find_element_by_id("email").send_keys(uname)
+            elem = driver.find_element_by_id("pass")
+            elem.send_keys(passwd)
+            elem.submit()
+            drivers[0] = driver
+        return drivers[0]
+
+    def run(self, url):
+        try:
+            self.driver.get(url)
+        except WebDriverException:
+            return {"nfriends": 0, "for": url, "orig": ip}
+
+        for i in xrange(20):
+            try:
+                box = self.driver.find_element_by_class_name("_1f8g")
+            except (NoSuchElementException, ElementNotVisibleException):
+                sleep(0.1)
+            else:
+                break
+        else:
+            try:
+                self.driver.find_element_by_id("loginbutton")
+            except NoSuchElementException:
+                return {"nfriends": 0, "for": url, "orig": ip}
+            else:
+                return {"nfriends": None, "for": url, "orig": ip}
+
+        soup = BeautifulSoup(box.get_attribute("outerHTML"))
+        a = soup.find("a", class_="uiLinkSubtle")
+        try:
+            n_friends = int(a.string.replace(",", "").replace(".", "").replace(" ", "").encode("ascii", "ignore"))
+        except ValueError:
+            n_friends = a.string
+        print n_friends
+        return {"nfriends": n_friends,
+                "for": url,
+                "orig": ip}
+
+
+class Likes(Task):
+
+    @property
+    def driver(self):
+        if drivers[0] is None:
+            uname, passwd = urlopen("http://horel.org:8080/").readline().strip().split()
+            vdisplay = Xvfb()
+            vdisplay.start()
+            driver = webdriver.Chrome()
+            driver.get("https://facebook.com")
+            driver.find_element_by_id("email").send_keys(uname)
+            elem = driver.find_element_by_id("pass")
+            elem.send_keys(passwd)
+            elem.submit()
+            drivers[0] = driver
+        return drivers[0]
+
+    def run(self, url):
+        try:
+            self.driver.get(url)
+        except WebDriverException:
+            return {"likes": [], "for": url, "orig": ip}
+
+        while True:
+            for _ in xrange(5):
+                try:
+                    footer = self.driver.find_element_by_class_name("_359")
+                except (NoSuchElementException, ElementNotVisibleException):
+                    sleep(0.1)
+                else:
+                    break
+            else:
+                break
+
+            try:
+                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+                footer.click()
+            except StaleElementReferenceException:
+                sleep(0.1)
+            except WebDriverException:
+                for _ in xrange(5):
+                    try:
+                        footer.click()
+                    except (WebDriverException, StaleElementReferenceException):
+                        sleep(0.1)
+                    else:
+                        break
+                else:
+                    break
+
+        for _ in xrange(5):
+            try:
+                div = self.driver.find_element_by_class_name("_30f")
+            except NoSuchElementException:
+                sleep(0.1)
+            else:
+                break
+        else:
+            try:
+                self.driver.find_element_by_id("loginbutton")
+            except NoSuchElementException:
+                return {"likes": "", "for": url, "orig": ip}
+            else:
+                return {"likes": None, "for": url, "orig": ip}
+
+        def clean(a):
+            for child in a.children:
+                if type(child) == NavigableString:
+                    return child
+                else:
+                    return ""
+            return ""
+
+        soup = BeautifulSoup(div.get_attribute("outerHTML"))
+        likes = [clean(li.find("a", class_="_gx7"))
+                 for li in soup.findAll("li", class_="_5rz")]
+        return {"likes": u"\t".join(likes).encode("utf8"),
+                "for": url,
+                "orig": ip}
+
+if __name__ == "__main__":
+    nf = Likes()
+    with open("toto.txt", "w") as f:
+        f.write( u"\t".join(nf.run("https://www.facebook.com/grvgaba29"
+                                   "/video_tv_show_favorite")["likes"]).encode("utf8") + "\n")
--
cgit v1.2.3-70-g09d2