diff options
Diffstat (limited to 'scraper.py')
| -rw-r--r-- | scraper.py | 86 |
1 files changed, 0 insertions, 86 deletions
diff --git a/scraper.py b/scraper.py deleted file mode 100644 index ee7dd8f..0000000 --- a/scraper.py +++ /dev/null @@ -1,86 +0,0 @@ -from selenium import webdriver -from selenium.webdriver.support.wait import WebDriverWait -from selenium.common.exceptions import TimeoutException,\ - ElementNotVisibleException, NoSuchElementException -from bs4 import BeautifulSoup, Tag - -import os.path - - -class Driver: - - def __init__(self, username, password): - self.driver = webdriver.PhantomJS() - self.username = username - self.password = password - self.__connect() - - def __ajax_complete(self): - return 0 == self.driver.execute_script("return jQuery.active") - - def __connect(self): - driver = self.driver - driver.get("http://twitter.com") - driver.find_element_by_id("signin-email").send_keys(self.username) - elem = driver.find_element_by_id("signin-password") - elem.send_keys(self.password) - elem.submit() - - def __get_followers(self, username): - driver = self.driver - driver.get("https://twitter.com/{0}/followers".format(username)) - try: - footer = driver.find_element_by_class_name("timeline-end") - except NoSuchElementException: - return - - while True: - try: - if "has-more-items" not in footer.get_attribute("class"): - break - footer.click() - try: - WebDriverWait(driver, - 5).until(lambda x: self.__ajax_complete(), - "Timeout waiting for " - "ajax to return") - except TimeoutException: - break - except (NoSuchElementException, ElementNotVisibleException): - break - - try: - fws = driver.find_element_by_id("stream-items-id") - except NoSuchElementException: - return - - soup = BeautifulSoup(fws.get_attribute("outerHTML")) - for follower in soup.ol: - if type(follower) == Tag: - div = follower.div - user_id = div["data-user-id"] - screen_name = div["data-screen-name"] - yield user_id, screen_name - - def get_followers(self, user_id, username): - filename = os.path.join("data", "users", user_id + ".txt") - if os.path.isfile(filename): - return filename - l = list(self.__get_followers(username)) - with open(filename, "w") as f: - for (fid, fname) in l: - f.write(fid + " " + fname + "\n") - return filename - - def process(self, filename): - with open(filename) as f: - for line in f: - values = line.strip().split() - self.get_followers(*values[:2]) - - -if __name__ == "__main__": - credentials = open("scraping_accounts.txt").readline().strip().split() - driver = Driver(*credentials[:2]) - # driver.get_followers("23302126", "flipper509") - print driver.get_profile(100, "thibauthorel") |
