diff options
Diffstat (limited to 'twitter/scraper.py')
| -rw-r--r-- | twitter/scraper.py | 96 |
1 file changed, 96 insertions, 0 deletions
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import TimeoutException,\
    ElementNotVisibleException, NoSuchElementException
from bs4 import BeautifulSoup, Tag

import os.path
from urllib2 import URLError  # Python 2 module; this script targets Python 2
from time import sleep


class Driver:
    """Scrapes a Twitter user's follower list with a PhantomJS browser.

    Logs in with the supplied credentials on construction, then drives the
    twitter.com followers page, clicking the "load more" footer until the
    whole timeline is present, and extracts (user_id, screen_name) pairs
    from the resulting HTML.
    """

    def __init__(self, username, password):
        # NOTE(review): PhantomJS spawns an external process; callers must
        # eventually call self.driver.quit() or the process leaks (see
        # the __main__ block below).
        self.driver = webdriver.PhantomJS()
        self.username = username
        self.password = password
        self.__connect()

    def __ajax_complete(self):
        # True once jQuery on the page reports no in-flight AJAX requests.
        return 0 == self.driver.execute_script("return jQuery.active")

    def __connect(self):
        """Log in to twitter.com with the stored credentials."""
        driver = self.driver
        driver.get("http://twitter.com")
        driver.find_element_by_id("signin-email").send_keys(self.username)
        elem = driver.find_element_by_id("signin-password")
        elem.send_keys(self.password)
        elem.submit()

    def __get_followers(self, username):
        """Yield (user_id, screen_name) for each follower of *username*.

        Best-effort generator: yields nothing when the page cannot be
        fetched or the expected timeline elements are missing.
        """
        driver = self.driver
        url = "https://twitter.com/{0}/followers".format(username)
        try:
            driver.get(url)
        except URLError:
            sleep(1)  # transient network error: retry once after a pause
            try:
                driver.get(url)
            except URLError:
                return

        try:
            footer = driver.find_element_by_class_name("timeline-end")
        except NoSuchElementException:
            return

        # Keep clicking the footer to page in more followers until Twitter
        # stops advertising more items (or the footer element goes away).
        while True:
            try:
                if "has-more-items" not in footer.get_attribute("class"):
                    break
                footer.click()
                try:
                    WebDriverWait(driver, 5).until(
                        lambda x: self.__ajax_complete(),
                        "Timeout waiting for "
                        "ajax to return")
                except TimeoutException:
                    break
            except (NoSuchElementException, ElementNotVisibleException):
                break

        try:
            fws = driver.find_element_by_id("stream-items-id")
        except NoSuchElementException:
            return

        # FIX: name the parser explicitly -- bare BeautifulSoup(html)
        # guesses a parser and may pick a different one per machine.
        soup = BeautifulSoup(fws.get_attribute("outerHTML"), "html.parser")
        # FIX: guard against a stream with no <ol> (iterating None raised
        # TypeError in the original).
        if soup.ol is None:
            return
        for follower in soup.ol:
            # Skip NavigableString whitespace between <li> elements.
            if isinstance(follower, Tag):  # FIX: was type(follower) == Tag
                div = follower.div
                yield div["data-user-id"], div["data-screen-name"]

    def get_followers(self, user_id, username):
        """Scrape followers of *username* into data/users/<user_id>.txt.

        Returns the cache filename; when the file already exists the
        scrape is skipped and the existing file is returned as-is.
        """
        filename = os.path.join("data", "users", user_id + ".txt")
        if os.path.isfile(filename):
            return filename
        # FIX: renamed ambiguous local `l` (PEP 8 / flake8 E741).
        followers = list(self.__get_followers(username))
        with open(filename, "w") as f:
            for fid, fname in followers:
                f.write(fid + " " + fname + "\n")
        return filename

    def process(self, filename):
        """Run get_followers for every '<user_id> <username>' line in file."""
        with open(filename) as f:
            for line in f:
                values = line.strip().split()
                self.get_followers(*values[:2])


if __name__ == "__main__":
    # FIX: close the credentials file (the original leaked the handle).
    with open("scraping_accounts.txt") as f:
        credentials = f.readline().strip().split()
    driver = Driver(*credentials[:2])
    try:
        # FIX: the original called driver.get_profile(100, "thibauthorel"),
        # a method that does not exist on Driver, with an int user_id that
        # would break the "user_id + '.txt'" join. Call the real method
        # with string arguments (the pair from the old commented example).
        print(driver.get_followers("23302126", "flipper509"))
    finally:
        # FIX: shut down the PhantomJS subprocess so it does not leak.
        driver.quit()
