summaryrefslogtreecommitdiffstats
path: root/scraper.py
diff options
context:
space:
mode:
authorThibaut Horel <thibaut.horel@gmail.com>2014-02-02 16:53:22 -0500
committerThibaut Horel <thibaut.horel@gmail.com>2014-02-02 16:53:22 -0500
commit7426d8ff0e7969eb1a86bdb5bec8a0c971309e2b (patch)
tree323d6a9a4423b51fbebb37c115fddeab1c7a9641 /scraper.py
parenta0e95b0843d4e366e4b979685f7c821954afebc6 (diff)
downloadfast-seeding-7426d8ff0e7969eb1a86bdb5bec8a0c971309e2b.tar.gz
Facebook scraping
Diffstat (limited to 'scraper.py')
-rw-r--r--scraper.py86
1 file changed, 0 insertions, 86 deletions
diff --git a/scraper.py b/scraper.py
deleted file mode 100644
index ee7dd8f..0000000
--- a/scraper.py
+++ /dev/null
@@ -1,86 +0,0 @@
-from selenium import webdriver
-from selenium.webdriver.support.wait import WebDriverWait
-from selenium.common.exceptions import TimeoutException,\
- ElementNotVisibleException, NoSuchElementException
-from bs4 import BeautifulSoup, Tag
-
-import os.path
-
-
class Driver:
    """Logged-in Selenium (PhantomJS) session for scraping Twitter follower lists."""

    def __init__(self, username, password):
        # Headless browser; credentials are used immediately to sign in.
        self.driver = webdriver.PhantomJS()
        self.username = username
        self.password = password
        self.__connect()

    def __ajax_complete(self):
        # True when the page's jQuery reports no in-flight AJAX requests.
        return 0 == self.driver.execute_script("return jQuery.active")

    def __connect(self):
        """Sign in to Twitter with the stored credentials."""
        driver = self.driver
        driver.get("http://twitter.com")
        driver.find_element_by_id("signin-email").send_keys(self.username)
        elem = driver.find_element_by_id("signin-password")
        elem.send_keys(self.password)
        elem.submit()

    def __get_followers(self, username):
        """Yield (user_id, screen_name) pairs for every follower of *username*.

        Clicks the infinite-scroll footer of the follower timeline until it
        stops reporting more items (or an AJAX wait times out), then parses
        the fully loaded list out of the DOM.
        """
        driver = self.driver
        driver.get("https://twitter.com/{0}/followers".format(username))
        try:
            footer = driver.find_element_by_class_name("timeline-end")
        except NoSuchElementException:
            # No timeline footer at all (e.g. protected/empty account page):
            # nothing to scrape.
            return

        while True:
            try:
                if "has-more-items" not in footer.get_attribute("class"):
                    break
                footer.click()
                try:
                    WebDriverWait(driver,
                                  5).until(lambda x: self.__ajax_complete(),
                                           "Timeout waiting for "
                                           "ajax to return")
                except TimeoutException:
                    # Give up on further pages; parse whatever has loaded.
                    break
            except (NoSuchElementException, ElementNotVisibleException):
                break

        try:
            stream = driver.find_element_by_id("stream-items-id")
        except NoSuchElementException:
            return

        # FIX: name an explicit parser — BeautifulSoup without one picks
        # whichever parser is installed (and warns), making results
        # environment-dependent.
        soup = BeautifulSoup(stream.get_attribute("outerHTML"), "html.parser")
        for follower in soup.ol:
            # soup.ol's children include whitespace NavigableStrings;
            # isinstance is the correct tag check (was: type(...) == Tag).
            if isinstance(follower, Tag):
                div = follower.div
                yield div["data-user-id"], div["data-screen-name"]

    def get_followers(self, user_id, username):
        """Scrape *username*'s followers into data/users/<user_id>.txt.

        Acts as a cache: if the file already exists the scrape is skipped.
        Returns the path of the (possibly pre-existing) file.
        """
        filename = os.path.join("data", "users", user_id + ".txt")
        if os.path.isfile(filename):
            return filename
        # Materialize before opening the file so a scraping failure
        # cannot leave a half-written cache entry behind.
        followers = list(self.__get_followers(username))
        with open(filename, "w") as f:
            for fid, fname in followers:
                f.write(fid + " " + fname + "\n")
        return filename

    def process(self, filename):
        """Scrape followers for each "<user_id> <username> ..." line of *filename*."""
        with open(filename) as f:
            for line in f:
                values = line.strip().split()
                self.get_followers(*values[:2])
-
-
if __name__ == "__main__":
    # First line of the accounts file is "<username> <password> ...".
    # FIX: use a context manager instead of leaking the open file handle,
    # and the parenthesized print form (valid in both Python 2 and 3).
    with open("scraping_accounts.txt") as f:
        credentials = f.readline().strip().split()
    driver = Driver(*credentials[:2])
    # driver.get_followers("23302126", "flipper509")
    # NOTE(review): Driver defines no get_profile() in this file — as
    # written this raises AttributeError; confirm intended method name.
    print(driver.get_profile(100, "thibauthorel"))