Diffstat (limited to 'facebook_scraping/client')
-rw-r--r--  facebook_scraping/client/Makefile           15
-rw-r--r--  facebook_scraping/client/__init__.py         0
-rw-r--r--  facebook_scraping/client/requirements.txt    4
-rw-r--r--  facebook_scraping/client/tasks.py          243
4 files changed, 262 insertions, 0 deletions
diff --git a/facebook_scraping/client/Makefile b/facebook_scraping/client/Makefile
new file mode 100644
index 0000000..3a07802
--- /dev/null
+++ b/facebook_scraping/client/Makefile
@@ -0,0 +1,15 @@
+all: bootstrap run
+
+bootstrap:
+	curl http://thibaut.horel.org/facebook/facebook.tar.gz > facebook.tar.gz
+	tar -xzf facebook.tar.gz
+
+run:
+	celery -A tasks worker --concurrency=2 --detach -l info
+
+stop:
+	rm -f celeryd.pid
+	pgrep -f "celery worker" | xargs kill -9
+
+restart:
+	pgrep -f "celery worker" | xargs kill -HUP
diff --git a/facebook_scraping/client/__init__.py b/facebook_scraping/client/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/facebook_scraping/client/__init__.py
diff --git a/facebook_scraping/client/requirements.txt b/facebook_scraping/client/requirements.txt
new file mode 100644
index 0000000..cba9c1f
--- /dev/null
+++ b/facebook_scraping/client/requirements.txt
@@ -0,0 +1,4 @@
+beautifulsoup4
+celery
+selenium
+xvfbwrapper
diff --git a/facebook_scraping/client/tasks.py b/facebook_scraping/client/tasks.py
new file mode 100644
index 0000000..4557968
--- /dev/null
+++ b/facebook_scraping/client/tasks.py
@@ -0,0 +1,243 @@
+from xvfbwrapper import Xvfb
+from selenium import webdriver
+from selenium.common.exceptions import ElementNotVisibleException,\
+    NoSuchElementException, StaleElementReferenceException, WebDriverException
+from time import sleep
+from bs4 import BeautifulSoup, NavigableString
+from celery import Celery, Task
+from urllib2 import urlopen
+import socket
+
+# Celery app pointed at the central broker; results are returned through the
+# rpc result backend.  Late acks make a task get re-queued if a worker dies
+# mid-scrape.
+app = Celery('tasks', broker='amqp://guest@horel.org//')
+app.conf.CELERY_RESULT_BACKEND = 'rpc'
+app.conf.CELERY_ENABLE_UTC = True
+app.conf.CELERY_ACKS_LATE = True
+# One shared, lazily created Selenium driver per worker process.
+drivers = [None]
+# This worker's IP address, returned with every result as "orig" so the server
+# knows which client produced it.
+ip = socket.gethostbyname(socket.gethostname())
+
+
+def strip(url):
+    # Undo the "/friends" or "&sk=friends" suffix added by normalize().
+    if url.endswith("/friends"):
+        return url[:-8]
+    else:
+        return url.split("&")[0]
+
+
+def normalize(url):
+    # Split a profile URL into (base URL, short name, friends-page URL),
+    # handling both profile.php?id=... URLs and vanity-name URLs.
+    if "profile.php" in url:
+        basename = url.split("&")[0]
+        fname = basename.split("=")[-1]
+        getname = basename + "&sk=friends"
+    else:
+        basename = url.split("?")[0]
+        fname = basename.split("/")[-1]
+        getname = basename + "/friends"
+    return basename, fname, getname
+
+
+class ListFollowers(Task):
+    """Scrape the profile URLs listed on a profile's friends page."""
+
+    @property
+    def driver(self):
+        # Lazily build one logged-in Chrome session per worker process.
+        if drivers[0] is None:
+            # The coordinator serves a "username password" pair on one line.
+            uname, passwd = urlopen("http://horel.org:8080/").readline().strip().split()
+            # Run Chrome inside a virtual X display so no real screen is needed.
+            vdisplay = Xvfb()
+            vdisplay.start()
+            driver = webdriver.Chrome()
+            driver.get("https://facebook.com")
+            driver.find_element_by_id("email").send_keys(uname)
+            elem = driver.find_element_by_id("pass")
+            elem.send_keys(passwd)
+            elem.submit()
+            drivers[0] = driver
+        return drivers[0]
+
+    def run(self, url):
+        try:
+            self.driver.get(url)
+        except WebDriverException:
+            # Navigation failed; report an empty friends list.
+            return {"friends": [], "for": url, "orig": ip}
+
+        # The friends list is loaded incrementally: keep scrolling to the
+        # bottom and clicking the pager element (class "_359") until it is
+        # gone, so that the whole list is in the DOM.
+        while True:
+            for _ in xrange(5):
+                try:
+                    footer = self.driver.find_element_by_class_name("_359")
+                except (NoSuchElementException, ElementNotVisibleException):
+                    sleep(0.1)
+                else:
+                    break
+            else:
+                # No pager left: everything is loaded.
+                break
+
+            try:
+                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+                footer.click()
+            except StaleElementReferenceException:
+                sleep(0.1)
+            except WebDriverException:
+                # The click can fail while the page is re-rendering; retry a
+                # few times and give up on loading more if it keeps failing.
+                for _ in xrange(5):
+                    try:
+                        footer.click()
+                    except (WebDriverException, StaleElementReferenceException):
+                        sleep(0.1)
+                    else:
+                        break
+                else:
+                    break
+
+        # "_30f" is the container of the friends list.
+        for _ in xrange(5):
+            try:
+                div = self.driver.find_element_by_class_name("_30f")
+            except NoSuchElementException:
+                sleep(0.1)
+            else:
+                break
+        else:
+            # No friends container: distinguish "nothing there" (empty list)
+            # from "we were bounced to the login page" (None).
+            try:
+                self.driver.find_element_by_id("loginbutton")
+            except NoSuchElementException:
+                return {"friends": [], "for": url, "orig": ip}
+            else:
+                return {"friends": None, "for": url, "orig": ip}
+
+        soup = BeautifulSoup(div.get_attribute("outerHTML"))
+        # Each "_698" list item wraps one friend; its first link is the
+        # friend's profile URL.
+        return {"friends": [li.a["href"]
+                            for li in soup.findAll("li", class_="_698")],
+                "for": url,
+                "orig": ip}
+
+
+class NumFollowers(Task):
+    """Scrape the number of friends displayed on a profile's friends page."""
+
+    @property
+    def driver(self):
+        # Same lazy, logged-in Chrome session as ListFollowers.driver.
+        if drivers[0] is None:
+            uname, passwd = urlopen("http://horel.org:8080/").readline().strip().split()
+            vdisplay = Xvfb()
+            vdisplay.start()
+            driver = webdriver.Chrome()
+            driver.get("https://facebook.com")
+            driver.find_element_by_id("email").send_keys(uname)
+            elem = driver.find_element_by_id("pass")
+            elem.send_keys(passwd)
+            elem.submit()
+            drivers[0] = driver
+        return drivers[0]
+
+    def run(self, url):
+        try:
+            self.driver.get(url)
+        except WebDriverException:
+            return {"nfriends": 0, "for": url, "orig": ip}
+
+        # "_1f8g" is the box showing the friends count; wait up to ~2s for it.
+        for _ in xrange(20):
+            try:
+                box = self.driver.find_element_by_class_name("_1f8g")
+            except (NoSuchElementException, ElementNotVisibleException):
+                sleep(0.1)
+            else:
+                break
+        else:
+            # No count box: distinguish "nothing to count" (0) from "bounced
+            # to the login page" (None).
+            try:
+                self.driver.find_element_by_id("loginbutton")
+            except NoSuchElementException:
+                return {"nfriends": 0, "for": url, "orig": ip}
+            else:
+                return {"nfriends": None, "for": url, "orig": ip}
+
+        soup = BeautifulSoup(box.get_attribute("outerHTML"))
+        a = soup.find("a", class_="uiLinkSubtle")
+        # The count is formatted with thousands separators; strip them before
+        # converting, and fall back to the raw string if it is still not a number.
+        try:
+            n_friends = int(a.string.replace(",", "").replace(".", "").replace(" ", "").encode("ascii", "ignore"))
+        except ValueError:
+            n_friends = a.string
+        print n_friends
+        return {"nfriends": n_friends,
+                "for": url,
+                "orig": ip}
+
+
+class Likes(Task):
+    """Scrape the names of the items listed on one of a profile's likes tabs."""
+
+    @property
+    def driver(self):
+        # Same lazy, logged-in Chrome session as ListFollowers.driver.
+        if drivers[0] is None:
+            uname, passwd = urlopen("http://horel.org:8080/").readline().strip().split()
+            vdisplay = Xvfb()
+            vdisplay.start()
+            driver = webdriver.Chrome()
+            driver.get("https://facebook.com")
+            driver.find_element_by_id("email").send_keys(uname)
+            elem = driver.find_element_by_id("pass")
+            elem.send_keys(passwd)
+            elem.submit()
+            drivers[0] = driver
+        return drivers[0]
+
+    def run(self, url):
+        try:
+            self.driver.get(url)
+        except WebDriverException:
+            return {"likes": [], "for": url, "orig": ip}
+
+        # Same incremental-loading loop as ListFollowers.run: scroll and click
+        # the "_359" pager until the whole list is in the DOM.
+        while True:
+            for _ in xrange(5):
+                try:
+                    footer = self.driver.find_element_by_class_name("_359")
+                except (NoSuchElementException, ElementNotVisibleException):
+                    sleep(0.1)
+                else:
+                    break
+            else:
+                break
+
+            try:
+                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+                footer.click()
+            except StaleElementReferenceException:
+                sleep(0.1)
+            except WebDriverException:
+                for _ in xrange(5):
+                    try:
+                        footer.click()
+                    except (WebDriverException, StaleElementReferenceException):
+                        sleep(0.1)
+                    else:
+                        break
+                else:
+                    break
+
+        for _ in xrange(5):
+            try:
+                div = self.driver.find_element_by_class_name("_30f")
+            except NoSuchElementException:
+                sleep(0.1)
+            else:
+                break
+        else:
+            try:
+                self.driver.find_element_by_id("loginbutton")
+            except NoSuchElementException:
+                return {"likes": "", "for": url, "orig": ip}
+            else:
+                return {"likes": None, "for": url, "orig": ip}
+
+        def clean(a):
+            # Return the first direct text node of the link, or "" if the link
+            # is missing or contains no direct text.
+            if a is None:
+                return ""
+            for child in a.children:
+                if type(child) == NavigableString:
+                    return child
+            return ""
+
+        soup = BeautifulSoup(div.get_attribute("outerHTML"))
+        # One "_5rz" list item per liked item; its "_gx7" link holds the name.
+        likes = [clean(li.find("a", class_="_gx7"))
+                 for li in soup.findAll("li", class_="_5rz")]
+        return {"likes": u"\t".join(likes).encode("utf8"),
+                "for": url,
+                "orig": ip}
+
+if __name__ == "__main__":
+    # Quick manual check: scrape one likes tab and dump the result to a file.
+    # run() already returns the likes as a single tab-joined, utf-8 encoded
+    # string, so it can be written out directly.
+    nf = Likes()
+    with open("toto.txt", "w") as f:
+        f.write(nf.run("https://www.facebook.com/grvgaba29"
+                       "/video_tv_show_favorite")["likes"] + "\n")
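For reference, a minimal, hypothetical producer-side sketch (not part of this commit) of how the coordinating server could dispatch one of these tasks and read the result back over the rpc backend. It reuses the broker URL and backend setting from tasks.py; the "tasks.<ClassName>" task names assume Celery's default naming for these auto-registered class-based tasks, and the profile URL is a placeholder.

    # dispatch_example.py -- hypothetical producer-side sketch, not in this commit.
    from celery import Celery

    # Same broker and result backend as tasks.py.
    app = Celery('tasks', broker='amqp://guest@horel.org//')
    app.conf.CELERY_RESULT_BACKEND = 'rpc'

    # Class-based Task subclasses are registered as "tasks.<ClassName>" by default.
    result = app.send_task('tasks.NumFollowers',
                           args=['https://www.facebook.com/some.profile/friends'])
    print(result.get(timeout=600))  # e.g. {'nfriends': ..., 'for': ..., 'orig': ...}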